In [ ]:
print("prelim stats")
prelim stats
In [ ]:
#import libraries

import csv
import pandas as pd
import numpy as np
In [ ]:
#import and read csv file
#display first 10 rows

import pandas as pd
crime_data1 = pd.read_csv(r"C:\Users\radon\Documents\CIND820/crimedata.csv")

crime_data1.head(10)
Out[ ]:
communityname state countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
0 BerkeleyHeightstownship NJ 39 5320 1 11980 3.10 1.37 91.78 6.50 1.88 12.47 21.44 10.93 11.33 11980 100.0 75122 89.24 1.55 70.20 23.62 1.03 18.39 79584 29711 30233 13600 5725 27101 5115 22838 227 1.96 5.81 9.90 48.18 2.70 64.55 14.65 28.82 5.49 50.73 3.67 26.38 5.22 4.47 3.22 91.43 90.17 95.78 95.81 44.56 58.88 31 0.36 1277 8.69 13.00 20.99 30.93 0.93 1.39 2.24 3.30 85.68 1.37 4.81 4.17 2.99 3.00 2.84 91.46 0.39 11.06 3 64 98.37 91.01 3.12 37.50 1959 0.00 0.28 215900 262600 326900 111000 685 1001 1001 316 1001 23.8 21.1 14.0 11 0 10.66 53.72 65.29 78.09 89.14 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 6.5 1845.9 9.63 ? ? ? ? 0.00 ? 0 0.00 0 0 1 8.2 4 32.81 14 114.85 138 1132.08 16 131.26 2 16.41 41.02 1394.59
1 Marpletownship PA 45 47616 1 23123 2.82 0.80 95.57 3.44 0.85 11.01 21.30 10.48 17.18 23123 100.0 47917 78.99 1.11 64.11 35.50 2.75 22.85 55323 20148 20191 18137 0 20074 5250 12222 885 3.98 5.61 13.72 29.89 2.43 61.96 12.26 29.28 6.39 37.64 4.23 27.99 6.45 5.42 3.11 86.91 85.33 96.82 86.46 51.14 62.43 43 0.24 1920 5.21 8.65 13.33 22.50 0.43 0.72 1.11 1.87 87.79 1.81 4.25 3.34 2.70 2.83 1.96 89.03 1.01 23.60 3 240 97.15 84.88 0.00 18.33 1958 0.31 0.14 136300 164200 199900 63600 467 560 672 205 627 27.6 20.7 12.5 0 0 8.30 77.17 71.27 90.22 96.12 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 10.6 2186.7 3.84 ? ? ? ? 0.00 ? 0 0.00 1 4.25 5 21.26 24 102.05 57 242.37 376 1598.78 26 110.55 1 4.25 127.56 1955.95
2 Tigardcity OR ? ? 1 29344 2.43 0.74 94.33 3.43 2.35 11.36 25.88 11.01 10.28 29344 100.0 35669 82.00 1.15 55.73 22.25 2.94 14.56 42112 16946 17103 16644 21606 15528 5954 8405 1389 4.75 2.80 9.09 30.13 4.01 69.80 15.95 21.52 8.79 32.48 10.10 25.78 14.76 12.55 2.95 78.54 78.85 92.37 75.72 66.08 74.19 164 0.88 1468 16.42 23.98 32.08 35.63 0.82 1.20 1.61 1.78 93.11 1.14 2.97 2.05 2.42 2.69 2.06 64.18 2.03 47.46 3 544 95.68 57.79 0.92 7.54 1976 1.55 0.12 74700 90400 112000 37300 370 428 520 150 484 24.1 21.7 11.6 16 0 5.00 44.77 36.60 61.26 82.85 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 10.6 2780.9 4.37 ? ? ? ? 0.00 ? 3 8.30 6 16.6 56 154.95 14 38.74 274 758.14 1797 4972.19 136 376.3 22 60.87 218.59 6167.51
3 Gloversvillecity NY 35 29443 1 16656 2.40 1.70 97.35 0.50 0.70 12.55 25.20 12.19 17.57 0 0.0 20580 68.15 0.24 38.95 39.48 11.71 18.33 26501 10810 10909 9984 4941 3541 2451 4391 2831 17.23 11.05 33.68 10.81 9.86 54.74 31.22 27.43 26.76 22.71 10.98 28.15 14.47 12.91 2.98 64.02 62.36 65.38 67.43 59.59 70.27 561 3.84 339 13.86 13.86 15.34 15.34 0.28 0.28 0.31 0.31 94.98 0.56 3.93 2.56 2.37 2.51 2.20 58.18 1.21 45.66 3 669 91.19 54.89 2.54 57.85 1939 7.00 0.87 36400 49600 66500 30100 195 250 309 114 333 28.7 20.6 14.5 0 0 2.04 88.71 56.70 90.17 96.24 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 5.2 3217.7 3.31 ? ? ? ? 0.00 ? 0 0.00 10 57.86 10 57.86 33 190.93 225 1301.78 716 4142.56 47 271.93 ? ? 306.64 ?
4 Bemidjicity MN 7 5068 1 11245 2.76 0.53 89.16 1.17 0.52 24.46 40.53 28.69 12.65 0 0.0 17390 69.33 0.55 42.82 32.16 11.21 14.43 24018 8483 9009 887 4425 3352 3000 1328 2855 29.99 12.15 23.06 25.28 9.08 52.44 6.89 36.54 10.94 27.80 7.51 50.66 11.64 9.73 2.98 58.59 55.20 66.51 79.17 61.22 68.94 402 4.70 196 46.94 56.12 67.86 69.90 0.82 0.98 1.18 1.22 94.64 0.39 5.23 3.11 2.35 2.55 2.12 58.13 2.94 55.64 2 333 92.45 53.57 3.90 42.64 1958 7.45 0.82 30600 43200 59500 28900 202 283 362 160 332 32.2 23.2 12.9 2 0 1.74 73.75 42.22 60.34 89.02 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 11.5 974.2 0.38 ? ? ? ? 0.00 ? 0 0.00 ? ? 4 32.04 14 112.14 91 728.93 1060 8490.87 91 728.93 5 40.05 ? 9988.79
5 Springfieldcity MO ? ? 1 140494 2.45 2.51 95.65 0.90 0.95 18.09 32.89 20.04 13.26 140494 100.0 21577 75.78 1.00 41.15 29.31 7.12 14.09 27705 11878 12029 7382 10264 10753 7192 8104 23223 17.78 8.76 23.03 20.66 5.72 59.02 14.31 26.83 14.72 23.42 11.40 33.32 14.46 13.04 2.89 71.94 69.79 79.76 75.33 62.96 70.52 1511 1.58 2091 21.33 30.56 38.02 45.48 0.32 0.45 0.57 0.68 96.87 0.60 3.08 1.92 2.28 2.37 2.16 57.81 2.11 53.19 2 5119 91.81 55.50 2.09 26.22 1966 6.13 0.31 37700 53900 73100 35400 215 280 349 134 340 26.4 17.3 11.7 327 4 1.49 64.35 42.29 70.61 85.66 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 70.4 1995.7 0.97 ? ? ? ? 0.00 ? 7 4.63 77 50.98 136 90.05 449 297.29 2094 1386.46 7690 5091.64 454 300.6 134 88.72 442.95 6867.42
6 Norwoodtown MA 21 50250 1 28700 2.60 1.60 96.57 1.47 1.10 11.17 27.41 12.76 14.42 28700 100.0 42805 79.47 0.39 47.70 30.23 5.41 17.23 50394 18193 18276 17342 21482 12639 21852 22594 1126 4.01 4.49 13.89 27.01 4.85 65.42 14.02 27.17 8.50 32.78 5.97 36.05 9.06 7.64 3.14 79.53 79.76 92.05 77.12 65.16 72.81 263 1.18 2637 11.38 16.27 23.93 27.76 1.05 1.49 2.20 2.55 89.98 0.60 5.08 3.46 2.55 2.89 2.09 64.62 1.47 47.35 3 566 95.11 56.96 1.41 34.45 1956 0.69 0.28 155100 179000 215500 60400 463 669 824 361 736 24.4 20.8 12.5 0 0 9.19 77.30 63.45 82.23 93.53 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 10.9 2643.5 9.62 ? ? ? ? 0.00 ? 0 0.00 4 13.53 9 30.44 54 182.66 110 372.09 288 974.19 144 487.1 17 57.5 226.63 1890.88
7 Andersoncity IN ? ? 1 59459 2.45 14.20 84.87 0.40 0.63 15.31 27.93 14.78 14.60 59449 100.0 23221 71.60 0.67 35.74 32.58 8.81 22.59 28901 12161 12599 9820 6634 8802 7428 6187 10320 17.98 10.09 28.67 12.00 8.19 56.59 27.00 21.54 21.92 18.02 13.28 28.34 16.33 14.94 2.95 62.56 58.70 69.89 62.76 63.08 72.44 2368 4.66 517 13.15 22.82 28.24 33.08 0.11 0.20 0.25 0.29 97.43 0.28 3.85 2.55 2.36 2.42 2.27 65.29 1.90 56.30 2 2051 92.22 63.82 6.39 56.36 1954 8.42 0.49 26300 37000 52400 26100 186 253 325 139 338 26.3 15.1 12.2 21 0 0.87 73.70 54.85 85.55 91.51 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 39.2 1515.3 0.70 ? ? ? ? 0.00 ? 8 13.13 34 55.79 98 160.8 128 210.02 608 997.6 2250 3691.79 125 205.1 9 14.77 439.73 4909.26
8 Fargocity ND 17 25700 1 74111 2.46 0.35 97.11 1.25 0.73 16.64 35.16 20.33 8.58 74115 100.0 25326 83.69 2.93 47.11 19.30 4.21 10.31 34269 13554 13727 8852 5344 8011 5332 5174 9603 13.68 5.52 11.27 30.24 4.18 68.51 6.89 31.55 11.37 29.43 7.29 40.87 9.94 8.64 3.00 79.35 79.70 86.60 80.70 74.32 78.51 751 1.64 1474 23.68 33.58 46.68 53.93 0.47 0.67 0.93 1.07 95.21 0.43 2.59 1.54 2.32 2.77 1.91 57.42 1.67 59.32 2 1562 95.07 48.10 0.45 25.61 1971 2.66 0.19 54500 70300 93700 39200 241 321 387 146 355 25.2 20.7 12.8 125 15 1.99 58.82 40.72 67.97 81.39 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 30.9 2399.3 1.41 ? ? ? ? 0.00 ? 0 0.00 35 43.87 16 20.05 41 51.39 425 532.66 3149 3946.71 206 258.18 8 10.03 115.31 4747.58
9 Wacocity TX ? ? 1 103590 2.62 23.14 67.60 0.92 16.35 19.88 34.55 21.62 13.12 103590 100.0 17852 74.20 0.86 30.98 29.09 9.06 13.99 24058 10195 12126 5715 11313 5770 7320 6984 27767 28.68 13.01 31.62 17.02 8.39 51.37 15.73 29.06 16.43 24.30 11.07 38.49 14.66 12.97 3.11 61.65 54.56 68.85 61.69 60.80 69.23 3537 4.71 4793 15.54 23.08 35.32 49.82 0.72 1.07 1.63 2.31 85.72 2.51 6.70 4.10 2.45 2.47 2.44 46.82 6.14 59.96 2 5606 87.57 46.51 5.64 37.57 1960 11.74 0.33 28600 43100 67400 38800 192 281 369 177 353 29.6 19.4 13.0 43 4 4.63 75.59 42.33 74.05 92.12 198 183.53 187 173.33 73432 68065.1 370.9 183.5 89.32 78.28 11.11 10.61 0 21.72 13 12 60.2 78.5 1319.3 0.76 100 9315474 94.44 10 6.57 86346.3 29 26.88 141 130.69 453 419.89 1043 966.77 2397 2221.81 6121 5673.63 1070 991.8 18 16.68 1544.24 8903.93
In [ ]:
#display data types of all columns

datatypes = crime_data1.dtypes
print(datatypes)
In [ ]:
#replace all missing values with NaN

import pandas as pd
crime_data = pd.read_csv('crimedata.csv', na_values=['?'])
crime_data.head(10)
Out[ ]:
communityname state countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
0 BerkeleyHeightstownship NJ 39.0 5320.0 1 11980 3.10 1.37 91.78 6.50 1.88 12.47 21.44 10.93 11.33 11980 100.0 75122 89.24 1.55 70.20 23.62 1.03 18.39 79584 29711 30233 13600 5725 27101 5115.0 22838 227 1.96 5.81 9.90 48.18 2.70 64.55 14.65 28.82 5.49 50.73 3.67 26.38 5.22 4.47 3.22 91.43 90.17 95.78 95.81 44.56 58.88 31 0.36 1277 8.69 13.00 20.99 30.93 0.93 1.39 2.24 3.30 85.68 1.37 4.81 4.17 2.99 3.00 2.84 91.46 0.39 11.06 3 64 98.37 91.01 3.12 37.50 1959 0.00 0.28 215900 262600 326900 111000 685 1001 1001 316 1001 23.8 21.1 14.0 11 0 10.66 53.72 65.29 78.09 89.14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.5 1845.9 9.63 NaN NaN NaN NaN 0.00 NaN 0 0.00 0.0 0.00 1.0 8.20 4.0 32.81 14.0 114.85 138.0 1132.08 16.0 131.26 2.0 16.41 41.02 1394.59
1 Marpletownship PA 45.0 47616.0 1 23123 2.82 0.80 95.57 3.44 0.85 11.01 21.30 10.48 17.18 23123 100.0 47917 78.99 1.11 64.11 35.50 2.75 22.85 55323 20148 20191 18137 0 20074 5250.0 12222 885 3.98 5.61 13.72 29.89 2.43 61.96 12.26 29.28 6.39 37.64 4.23 27.99 6.45 5.42 3.11 86.91 85.33 96.82 86.46 51.14 62.43 43 0.24 1920 5.21 8.65 13.33 22.50 0.43 0.72 1.11 1.87 87.79 1.81 4.25 3.34 2.70 2.83 1.96 89.03 1.01 23.60 3 240 97.15 84.88 0.00 18.33 1958 0.31 0.14 136300 164200 199900 63600 467 560 672 205 627 27.6 20.7 12.5 0 0 8.30 77.17 71.27 90.22 96.12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 10.6 2186.7 3.84 NaN NaN NaN NaN 0.00 NaN 0 0.00 1.0 4.25 5.0 21.26 24.0 102.05 57.0 242.37 376.0 1598.78 26.0 110.55 1.0 4.25 127.56 1955.95
2 Tigardcity OR NaN NaN 1 29344 2.43 0.74 94.33 3.43 2.35 11.36 25.88 11.01 10.28 29344 100.0 35669 82.00 1.15 55.73 22.25 2.94 14.56 42112 16946 17103 16644 21606 15528 5954.0 8405 1389 4.75 2.80 9.09 30.13 4.01 69.80 15.95 21.52 8.79 32.48 10.10 25.78 14.76 12.55 2.95 78.54 78.85 92.37 75.72 66.08 74.19 164 0.88 1468 16.42 23.98 32.08 35.63 0.82 1.20 1.61 1.78 93.11 1.14 2.97 2.05 2.42 2.69 2.06 64.18 2.03 47.46 3 544 95.68 57.79 0.92 7.54 1976 1.55 0.12 74700 90400 112000 37300 370 428 520 150 484 24.1 21.7 11.6 16 0 5.00 44.77 36.60 61.26 82.85 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 10.6 2780.9 4.37 NaN NaN NaN NaN 0.00 NaN 3 8.30 6.0 16.60 56.0 154.95 14.0 38.74 274.0 758.14 1797.0 4972.19 136.0 376.30 22.0 60.87 218.59 6167.51
3 Gloversvillecity NY 35.0 29443.0 1 16656 2.40 1.70 97.35 0.50 0.70 12.55 25.20 12.19 17.57 0 0.0 20580 68.15 0.24 38.95 39.48 11.71 18.33 26501 10810 10909 9984 4941 3541 2451.0 4391 2831 17.23 11.05 33.68 10.81 9.86 54.74 31.22 27.43 26.76 22.71 10.98 28.15 14.47 12.91 2.98 64.02 62.36 65.38 67.43 59.59 70.27 561 3.84 339 13.86 13.86 15.34 15.34 0.28 0.28 0.31 0.31 94.98 0.56 3.93 2.56 2.37 2.51 2.20 58.18 1.21 45.66 3 669 91.19 54.89 2.54 57.85 1939 7.00 0.87 36400 49600 66500 30100 195 250 309 114 333 28.7 20.6 14.5 0 0 2.04 88.71 56.70 90.17 96.24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.2 3217.7 3.31 NaN NaN NaN NaN 0.00 NaN 0 0.00 10.0 57.86 10.0 57.86 33.0 190.93 225.0 1301.78 716.0 4142.56 47.0 271.93 NaN NaN 306.64 NaN
4 Bemidjicity MN 7.0 5068.0 1 11245 2.76 0.53 89.16 1.17 0.52 24.46 40.53 28.69 12.65 0 0.0 17390 69.33 0.55 42.82 32.16 11.21 14.43 24018 8483 9009 887 4425 3352 3000.0 1328 2855 29.99 12.15 23.06 25.28 9.08 52.44 6.89 36.54 10.94 27.80 7.51 50.66 11.64 9.73 2.98 58.59 55.20 66.51 79.17 61.22 68.94 402 4.70 196 46.94 56.12 67.86 69.90 0.82 0.98 1.18 1.22 94.64 0.39 5.23 3.11 2.35 2.55 2.12 58.13 2.94 55.64 2 333 92.45 53.57 3.90 42.64 1958 7.45 0.82 30600 43200 59500 28900 202 283 362 160 332 32.2 23.2 12.9 2 0 1.74 73.75 42.22 60.34 89.02 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11.5 974.2 0.38 NaN NaN NaN NaN 0.00 NaN 0 0.00 NaN NaN 4.0 32.04 14.0 112.14 91.0 728.93 1060.0 8490.87 91.0 728.93 5.0 40.05 NaN 9988.79
5 Springfieldcity MO NaN NaN 1 140494 2.45 2.51 95.65 0.90 0.95 18.09 32.89 20.04 13.26 140494 100.0 21577 75.78 1.00 41.15 29.31 7.12 14.09 27705 11878 12029 7382 10264 10753 7192.0 8104 23223 17.78 8.76 23.03 20.66 5.72 59.02 14.31 26.83 14.72 23.42 11.40 33.32 14.46 13.04 2.89 71.94 69.79 79.76 75.33 62.96 70.52 1511 1.58 2091 21.33 30.56 38.02 45.48 0.32 0.45 0.57 0.68 96.87 0.60 3.08 1.92 2.28 2.37 2.16 57.81 2.11 53.19 2 5119 91.81 55.50 2.09 26.22 1966 6.13 0.31 37700 53900 73100 35400 215 280 349 134 340 26.4 17.3 11.7 327 4 1.49 64.35 42.29 70.61 85.66 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 70.4 1995.7 0.97 NaN NaN NaN NaN 0.00 NaN 7 4.63 77.0 50.98 136.0 90.05 449.0 297.29 2094.0 1386.46 7690.0 5091.64 454.0 300.60 134.0 88.72 442.95 6867.42
6 Norwoodtown MA 21.0 50250.0 1 28700 2.60 1.60 96.57 1.47 1.10 11.17 27.41 12.76 14.42 28700 100.0 42805 79.47 0.39 47.70 30.23 5.41 17.23 50394 18193 18276 17342 21482 12639 21852.0 22594 1126 4.01 4.49 13.89 27.01 4.85 65.42 14.02 27.17 8.50 32.78 5.97 36.05 9.06 7.64 3.14 79.53 79.76 92.05 77.12 65.16 72.81 263 1.18 2637 11.38 16.27 23.93 27.76 1.05 1.49 2.20 2.55 89.98 0.60 5.08 3.46 2.55 2.89 2.09 64.62 1.47 47.35 3 566 95.11 56.96 1.41 34.45 1956 0.69 0.28 155100 179000 215500 60400 463 669 824 361 736 24.4 20.8 12.5 0 0 9.19 77.30 63.45 82.23 93.53 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 10.9 2643.5 9.62 NaN NaN NaN NaN 0.00 NaN 0 0.00 4.0 13.53 9.0 30.44 54.0 182.66 110.0 372.09 288.0 974.19 144.0 487.10 17.0 57.50 226.63 1890.88
7 Andersoncity IN NaN NaN 1 59459 2.45 14.20 84.87 0.40 0.63 15.31 27.93 14.78 14.60 59449 100.0 23221 71.60 0.67 35.74 32.58 8.81 22.59 28901 12161 12599 9820 6634 8802 7428.0 6187 10320 17.98 10.09 28.67 12.00 8.19 56.59 27.00 21.54 21.92 18.02 13.28 28.34 16.33 14.94 2.95 62.56 58.70 69.89 62.76 63.08 72.44 2368 4.66 517 13.15 22.82 28.24 33.08 0.11 0.20 0.25 0.29 97.43 0.28 3.85 2.55 2.36 2.42 2.27 65.29 1.90 56.30 2 2051 92.22 63.82 6.39 56.36 1954 8.42 0.49 26300 37000 52400 26100 186 253 325 139 338 26.3 15.1 12.2 21 0 0.87 73.70 54.85 85.55 91.51 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 39.2 1515.3 0.70 NaN NaN NaN NaN 0.00 NaN 8 13.13 34.0 55.79 98.0 160.80 128.0 210.02 608.0 997.60 2250.0 3691.79 125.0 205.10 9.0 14.77 439.73 4909.26
8 Fargocity ND 17.0 25700.0 1 74111 2.46 0.35 97.11 1.25 0.73 16.64 35.16 20.33 8.58 74115 100.0 25326 83.69 2.93 47.11 19.30 4.21 10.31 34269 13554 13727 8852 5344 8011 5332.0 5174 9603 13.68 5.52 11.27 30.24 4.18 68.51 6.89 31.55 11.37 29.43 7.29 40.87 9.94 8.64 3.00 79.35 79.70 86.60 80.70 74.32 78.51 751 1.64 1474 23.68 33.58 46.68 53.93 0.47 0.67 0.93 1.07 95.21 0.43 2.59 1.54 2.32 2.77 1.91 57.42 1.67 59.32 2 1562 95.07 48.10 0.45 25.61 1971 2.66 0.19 54500 70300 93700 39200 241 321 387 146 355 25.2 20.7 12.8 125 15 1.99 58.82 40.72 67.97 81.39 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 30.9 2399.3 1.41 NaN NaN NaN NaN 0.00 NaN 0 0.00 35.0 43.87 16.0 20.05 41.0 51.39 425.0 532.66 3149.0 3946.71 206.0 258.18 8.0 10.03 115.31 4747.58
9 Wacocity TX NaN NaN 1 103590 2.62 23.14 67.60 0.92 16.35 19.88 34.55 21.62 13.12 103590 100.0 17852 74.20 0.86 30.98 29.09 9.06 13.99 24058 10195 12126 5715 11313 5770 7320.0 6984 27767 28.68 13.01 31.62 17.02 8.39 51.37 15.73 29.06 16.43 24.30 11.07 38.49 14.66 12.97 3.11 61.65 54.56 68.85 61.69 60.80 69.23 3537 4.71 4793 15.54 23.08 35.32 49.82 0.72 1.07 1.63 2.31 85.72 2.51 6.70 4.10 2.45 2.47 2.44 46.82 6.14 59.96 2 5606 87.57 46.51 5.64 37.57 1960 11.74 0.33 28600 43100 67400 38800 192 281 369 177 353 29.6 19.4 13.0 43 4 4.63 75.59 42.33 74.05 92.12 198.0 183.53 187.0 173.33 73432.0 68065.1 370.9 183.5 89.32 78.28 11.11 10.61 0.0 21.72 13.0 12.0 60.2 78.5 1319.3 0.76 100.0 9315474.0 94.44 10.0 6.57 86346.3 29 26.88 141.0 130.69 453.0 419.89 1043.0 966.77 2397.0 2221.81 6121.0 5673.63 1070.0 991.80 18.0 16.68 1544.24 8903.93
In [ ]:
#check number of rows
len(crime_data)
Out[ ]:
2215
In [ ]:
#check datatypes again

datatypes2 = crime_data.dtypes
print(datatypes2)
In [ ]:
#display the sum of all NaN in each column

crime_data.isnull().sum()
In [ ]:
#seperate all columns with missing values

null_cols = crime_data[crime_data.columns[crime_data.isna().any()]]
null_cols.head()
Out[ ]:
countyCode communityCode OtherPerCap LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy PolicBudgPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
0 39.0 5320.0 5115.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.00 1.0 8.20 4.0 32.81 14.0 114.85 138.0 1132.08 16.0 131.26 2.0 16.41 41.02 1394.59
1 45.0 47616.0 5250.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 4.25 5.0 21.26 24.0 102.05 57.0 242.37 376.0 1598.78 26.0 110.55 1.0 4.25 127.56 1955.95
2 NaN NaN 5954.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 16.60 56.0 154.95 14.0 38.74 274.0 758.14 1797.0 4972.19 136.0 376.30 22.0 60.87 218.59 6167.51
3 35.0 29443.0 2451.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 10.0 57.86 10.0 57.86 33.0 190.93 225.0 1301.78 716.0 4142.56 47.0 271.93 NaN NaN 306.64 NaN
4 7.0 5068.0 3000.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.0 32.04 14.0 112.14 91.0 728.93 1060.0 8490.87 91.0 728.93 5.0 40.05 NaN 9988.79
In [ ]:
#display sums of missing values of only columns that contain missing values

null_list= null_cols.columns.values
null_cols.isnull().sum()
In [ ]:
#change settings to display all columns
pd.set_option('display.max_columns', None)

#get summary statistics of all columns
crime_data.describe()
Out[ ]:
countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
count 994.000000 991.000000 2215.000000 2.215000e+03 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2.215000e+03 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2214.000000 2215.000000 2.215000e+03 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2.215000e+03 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 343.000000 343.000000 343.000000 343.000000 3.430000e+02 3.430000e+02 343.000000 343.000000 343.000000 343.000000 343.000000 343.000000 343.000000 343.000000 343.000000 343.000000 343.000000 2215.000000 2215.000000 2215.000000 343.000000 3.430000e+02 343.000000 343.000000 2215.000000 3.430000e+02 2215.000000 2215.000000 2007.000000 2007.00000 2214.000000 2214.000000 2202.000000 2202.000000 2212.000000 2212.000000 2212.000000 2212.000000 2212.000000 2212.000000 2124.000000 2124.000000 1994.000000 2118.000000
mean 65.587525 45209.251261 5.494357 5.311798e+04 2.707327 9.335102 83.979819 2.670203 7.950176 14.445837 27.644840 13.975142 11.836393 4.773472e+04 70.465309 33984.696163 78.312758 0.881842 43.750935 26.409418 6.801445 15.969002 39857.055079 15603.524605 16567.698420 11541.749436 12229.191422 14227.989616 9442.765131 11018.998194 7.590853e+03 11.620537 9.186646 22.305120 23.056876 6.045242 62.021612 18.228907 24.532298 13.819165 28.209201 9.127585 30.683517 12.325300 10.812515 3.129698 74.059129 71.227255 81.865422 75.521788 60.542641 68.854795 2141.418962 3.115499 6.277274e+03 13.525693 20.421287 27.544181 34.733928 1.099124 1.697463 2.307503 2.943761 87.074993 2.405792 5.386619 3.915788 2.615842 2.740483 2.367138 66.369454 4.132438 45.405341 2.640632 1748.368849 92.933973 63.368298 2.778524 34.773887 1962.623476 4.289824 0.425273 88695.802257 113097.523251 145318.257788 56622.455530 329.966591 428.537246 527.252822 197.286230 501.466366 26.298104 20.990158 13.010203 66.953499 17.823476 7.340302 61.539630 51.538596 77.411079 88.111865 499.198251 246.490962 432.559767 210.844781 2.524050e+05 1.206517e+05 523.658309 246.493586 85.499679 82.515831 9.263294 5.459767 0.681283 15.242245 26.288630 8.816327 119.114286 27.419955 2783.835034 3.041124 185.478134 3.217602e+07 87.130933 4.285714 0.980163 1.535779e+05 7.764786 5.859296 28.046338 36.25848 237.952123 162.612597 326.528156 378.004605 761.236890 1033.430203 2137.629295 3372.979150 516.692586 473.965628 30.907721 32.153682 589.078922 4908.241804
std 117.831399 25425.861573 2.872924 2.046203e+05 0.334120 14.247156 16.419080 4.473843 14.589832 4.518623 6.181517 5.970747 4.777565 2.056067e+05 44.080275 13424.680011 7.950672 0.689006 12.787925 8.295604 4.700335 4.622553 14251.206032 6281.558523 6346.840251 9232.102062 14853.836177 9881.266395 7926.466713 5884.063446 3.936146e+04 8.600352 6.666703 10.989517 12.687213 2.895618 8.312045 8.099281 6.659470 6.430264 9.326123 2.802747 8.127991 3.262613 3.000883 0.240743 10.525952 12.045048 12.263736 10.365262 8.008937 6.679960 14692.582838 3.127681 5.541965e+04 9.780098 12.410355 14.368813 16.327322 1.595766 2.461060 3.286648 4.246468 14.076087 4.210368 3.794309 3.175770 0.315646 0.297421 0.391806 14.182588 5.599131 13.778347 0.512686 6503.866478 5.040736 13.970057 3.592396 13.911468 11.166555 4.088175 0.426188 66670.781534 81906.362277 99030.913816 39106.498041 144.138461 170.706644 199.290780 85.205688 169.271735 2.979297 2.987622 1.419679 564.253149 245.452553 8.418476 16.750061 10.517926 10.878186 7.287836 1681.472251 273.799162 1493.708385 235.478815 6.894498e+05 1.482113e+05 307.839007 273.798409 10.941312 15.332612 11.021424 10.604533 1.706344 14.826756 100.821921 2.836391 92.495186 109.822600 2828.993341 4.912917 318.542834 1.104566e+08 10.349612 4.064538 2.877128 2.030409e+05 58.166468 9.156829 105.616135 34.23975 2250.720788 234.486624 1987.947941 438.238599 3111.702756 763.354442 7600.573464 1901.316145 3258.164244 504.666026 180.125248 39.240900 614.784518 2739.708901
min 1.000000 70.000000 1.000000 1.000500e+04 1.600000 0.000000 2.680000 0.030000 0.120000 4.580000 9.380000 4.640000 1.660000 0.000000e+00 0.000000 8866.000000 31.680000 0.000000 5.810000 4.810000 0.180000 3.460000 10447.000000 5237.000000 5472.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.800000e+01 0.640000 0.200000 1.460000 1.630000 1.320000 24.820000 2.050000 8.690000 1.370000 6.480000 2.130000 12.060000 3.350000 2.830000 2.290000 22.970000 18.300000 8.700000 20.200000 24.420000 41.950000 0.000000 0.000000 2.000000e+01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.150000 0.000000 0.960000 0.440000 1.580000 1.610000 1.550000 13.930000 0.050000 3.060000 1.000000 36.000000 37.470000 16.860000 0.000000 3.120000 1939.000000 0.000000 0.000000 14999.000000 19500.000000 28200.000000 0.000000 99.000000 120.000000 182.000000 0.000000 192.000000 14.900000 14.000000 10.100000 0.000000 0.000000 0.180000 6.750000 11.830000 27.950000 32.830000 65.000000 29.400000 14.000000 19.210000 2.100000e+03 2.704800e+03 20.800000 29.400000 42.150000 1.600000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.900000 10.000000 0.000000 20.000000 2.380215e+06 10.850000 0.000000 0.000000 1.526040e+04 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 2.000000 16.920000 10.000000 77.860000 1.000000 6.550000 0.000000 0.000000 0.000000 116.790000
25% 11.000000 22887.000000 3.000000 1.436600e+04 2.500000 0.860000 76.320000 0.620000 0.930000 12.250000 24.415000 11.320000 8.750000 0.000000e+00 0.000000 23817.000000 73.400000 0.460000 34.680000 20.770000 3.270000 12.875000 29538.000000 11602.500000 12610.500000 6742.500000 6345.000000 8285.500000 5528.250000 7274.000000 9.125000e+02 4.510000 4.640000 13.920000 14.095000 4.045000 56.490000 12.215000 20.075000 9.130000 21.910000 7.110000 25.450000 9.860000 8.575000 2.990000 67.900000 63.990000 74.780000 70.170000 55.430000 64.900000 147.000000 1.070000 4.000000e+02 6.695000 11.255000 17.205000 22.725000 0.170000 0.280000 0.390000 0.520000 84.380000 0.510000 3.390000 2.370000 2.410000 2.550000 2.110000 57.285000 1.290000 37.505000 2.000000 304.500000 91.290000 54.820000 0.720000 24.480000 1956.000000 0.905000 0.160000 41500.000000 56200.000000 74300.000000 32200.000000 213.500000 289.500000 366.000000 139.000000 364.000000 24.300000 18.700000 12.000000 0.000000 0.000000 2.060000 50.110000 44.995000 72.060000 85.200000 131.000000 149.115000 114.000000 132.245000 4.986450e+04 6.484505e+04 343.350000 149.150000 79.435000 76.230000 2.020000 0.415000 0.000000 4.950000 6.000000 7.000000 55.100000 7.300000 1181.900000 0.360000 54.000000 7.275060e+06 84.295000 0.000000 0.000000 8.809435e+04 0.000000 0.000000 2.000000 11.53500 5.000000 27.647500 18.000000 94.187500 95.000000 511.690000 392.000000 2040.080000 30.000000 156.952500 1.000000 7.670000 161.700000 2918.070000
50% 27.000000 46925.000000 5.000000 2.279200e+04 2.660000 2.870000 90.350000 1.230000 2.180000 13.620000 26.780000 12.540000 11.730000 1.804100e+04 100.000000 31441.000000 78.610000 0.690000 42.880000 26.590000 5.610000 15.650000 36678.000000 14101.000000 15073.000000 9777.000000 9895.000000 12250.000000 8186.000000 9721.000000 2.142000e+03 9.330000 7.740000 21.380000 19.650000 5.450000 62.440000 17.300000 23.390000 13.150000 26.240000 9.150000 29.000000 12.520000 10.900000 3.100000 75.030000 72.530000 83.990000 76.920000 60.710000 69.230000 352.000000 2.040000 1.024000e+03 12.260000 19.080000 26.720000 34.790000 0.500000 0.750000 1.040000 1.310000 92.180000 0.920000 4.280000 3.050000 2.570000 2.710000 2.290000 65.910000 2.340000 46.390000 3.000000 558.000000 94.210000 62.830000 1.660000 34.100000 1964.000000 2.850000 0.320000 65500.000000 82800.000000 106700.000000 43400.000000 307.000000 397.000000 486.000000 171.000000 467.000000 26.100000 21.000000 12.800000 0.000000 0.000000 4.310000 64.490000 52.170000 79.490000 90.030000 173.000000 196.010000 152.000000 170.270000 9.000000e+04 9.103460e+04 443.200000 196.000000 87.930000 86.180000 5.000000 2.040000 0.000000 11.370000 12.000000 9.000000 98.700000 13.700000 2027.300000 1.220000 86.000000 1.116411e+07 89.580000 5.000000 0.000000 1.145820e+05 1.000000 2.170000 7.000000 26.92000 19.000000 74.800000 56.000000 226.525000 205.000000 822.715000 747.000000 3079.510000 75.000000 302.355000 5.000000 21.080000 374.060000 4425.450000
75% 80.500000 65805.000000 8.000000 4.302400e+04 2.850000 11.145000 96.225000 2.670000 7.810000 15.360000 29.205000 14.345000 14.415000 4.191800e+04 100.000000 41480.500000 84.030000 1.100000 52.740000 31.715000 9.105000 18.755000 46999.000000 17795.000000 18609.500000 14526.000000 14757.500000 17327.500000 11525.500000 13418.000000 4.988000e+03 16.905000 11.835000 29.195000 29.055000 7.440000 67.825000 23.400000 27.590000 17.665000 32.815000 11.050000 33.410000 14.745000 12.985000 3.220000 81.900000 80.395000 91.675000 82.765000 65.985000 73.495000 1031.500000 3.910000 3.302000e+03 17.950000 27.445000 36.495000 46.185000 1.310000 2.015000 2.700000 3.455000 95.455000 2.270000 5.870000 4.210000 2.770000 2.900000 2.530000 76.580000 4.730000 53.515000 3.000000 1228.000000 96.020000 72.645000 3.430000 43.970000 1971.000000 6.805000 0.555000 121500.000000 150600.000000 188000.000000 65450.000000 421.000000 544.000000 659.500000 232.500000 615.000000 28.000000 23.100000 13.700000 22.000000 1.000000 9.250000 74.855000 58.740000 85.135000 93.010000 314.000000 260.650000 285.500000 226.815000 1.719235e+05 1.303246e+05 637.250000 260.650000 93.645000 93.340000 14.065000 6.215000 0.650000 19.740000 23.000000 10.500000 153.550000 26.100000 3321.700000 3.365000 189.500000 2.014754e+07 93.200000 10.000000 0.000000 1.556557e+05 3.000000 8.365000 19.000000 51.47000 70.000000 187.155000 180.000000 504.387500 508.000000 1350.232500 1675.000000 4335.410000 232.500000 589.775000 16.000000 42.852500 794.400000 6229.280000
max 840.000000 94597.000000 10.000000 7.322564e+06 5.280000 96.670000 99.630000 57.460000 95.290000 54.400000 70.510000 63.620000 52.770000 7.322564e+06 100.000000 123625.000000 96.760000 6.530000 89.040000 76.390000 44.820000 45.510000 139008.000000 63302.000000 68850.000000 212120.000000 480000.000000 106165.000000 137000.000000 54648.000000 1.384994e+06 58.000000 49.890000 73.660000 79.180000 31.230000 84.670000 50.030000 62.670000 44.270000 64.970000 20.080000 76.600000 23.920000 22.230000 4.640000 93.600000 92.580000 100.000000 97.340000 87.970000 89.370000 527557.000000 27.350000 2.082931e+06 64.290000 76.160000 80.810000 88.000000 13.710000 19.930000 25.340000 32.630000 98.980000 38.330000 34.870000 30.870000 4.520000 4.480000 4.730000 97.240000 59.490000 95.340000 4.000000 172768.000000 99.000000 96.490000 39.890000 82.130000 1987.000000 23.880000 5.330000 500001.000000 500001.000000 500001.000000 331000.000000 1001.000000 1001.000000 1001.000000 803.000000 1001.000000 35.100000 32.700000 23.400000 23383.000000 10447.000000 60.400000 93.140000 78.560000 96.590000 99.900000 25655.000000 3437.230000 22496.000000 3290.620000 8.328470e+06 1.926282e+06 2162.500000 3437.200000 100.000000 100.000000 67.310000 98.400000 18.570000 98.400000 1773.000000 15.000000 634.700000 3569.800000 44229.900000 54.330000 3187.000000 1.617293e+09 99.940000 10.000000 48.440000 2.422367e+06 1946.000000 91.090000 2818.000000 401.35000 86001.000000 2264.130000 62778.000000 4932.500000 99207.000000 11881.020000 235132.000000 25910.550000 112464.000000 4968.590000 5119.000000 436.370000 4877.060000 27119.760000
In [ ]:
crime_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2215 entries, 0 to 2214
Columns: 147 entries, communityname to nonViolPerPop
dtypes: float64(116), int64(29), object(2)
memory usage: 2.5+ MB
In [ ]:
#find the mean of each column by state
mean_data = crime_data.groupby("state").mean()
mean_data.head()
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\3113613801.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  mean_data = crime_data.groupby("state").mean()
Out[ ]:
countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
state
AK NaN NaN 7.333333 94644.000000 2.756667 6.826667 77.906667 4.136667 4.086667 13.986667 29.586667 13.966667 4.303333 73915.000000 32.656667 41301.000000 89.586667 1.083333 74.923333 10.960000 6.843333 13.766667 46051.333333 18068.333333 19928.333333 12596.666667 10115.333333 12005.333333 12153.000000 11534.666667 6681.666667 7.683333 3.480000 11.176667 25.300000 6.610000 72.913333 3.343333 24.726667 8.743333 31.840000 12.896667 31.660000 16.116667 14.440000 3.196667 74.700000 73.003333 81.053333 78.000000 63.173333 72.633333 1865.333333 3.133333 5397.333333 12.216667 20.800000 32.660000 40.870000 0.653333 1.106667 1.730000 2.153333 90.686667 1.236667 5.390000 3.676667 2.653333 2.783333 2.503333 49.940000 5.586667 53.32000 2.333333 4613.000000 89.246667 47.370000 5.843333 39.796667 1973.666667 5.483333 1.216667 78433.333333 102866.666667 132533.333333 54100.000000 407.000000 527.333333 673.666667 266.666667 580.000000 24.900000 21.500000 11.300000 130.000000 25.000000 5.243333 28.786667 35.826667 64.576667 72.073333 266.000000 104.930000 252.000000 99.410000 107811.000000 42529.000000 405.300000 104.900000 89.390000 89.470000 4.140000 1.880000 2.630000 8.650000 16.000000 11.000000 0.000000 1492.733333 363.033333 2.916667 301.000000 2.941312e+07 94.740000 10.000000 2.006667 116028.100000 9.666667 8.870000 83.000000 66.080000 217.000000 156.103333 523.666667 345.516667 753.000000 639.463333 4325.666667 3813.090000 562.333333 485.286667 38.000000 23.833333 576.576667 4961.673333
AL NaN NaN 6.000000 39231.186047 2.611860 27.044651 71.799535 0.750000 0.665116 16.117209 28.540233 15.227209 12.415581 34182.069767 64.992558 26034.860465 75.144419 0.855814 33.825349 28.445814 8.100930 15.898837 33033.232558 13065.651163 14933.186047 7357.930233 12009.325581 12114.069767 6510.232558 10418.418605 7405.232558 17.863256 11.406977 27.426279 23.013023 6.635581 57.156047 19.573953 26.390698 16.170233 28.505814 9.366047 29.517907 13.030698 11.367674 3.075116 69.230698 64.306744 76.904186 70.583023 63.534884 69.728837 1965.325581 4.789070 687.255814 18.807209 28.404186 33.491860 42.968140 0.294884 0.434419 0.531163 0.648837 96.534651 0.424186 4.433023 3.093023 2.520000 2.629767 2.335349 65.245581 2.924186 41.71000 2.860465 1366.395349 92.252093 62.581628 2.883953 37.815349 1966.976744 6.644186 0.640930 46648.837209 63955.813953 89079.069767 42430.232558 170.465116 246.697674 325.372093 154.906977 336.627907 25.218605 18.051163 12.688372 32.093023 7.883721 1.483488 71.264186 51.849767 78.885349 87.704419 293.833333 184.121667 252.500000 155.971667 141162.166667 87517.881667 431.033333 184.150000 81.233333 75.255000 23.325000 0.425000 0.720000 24.471667 14.333333 8.500000 164.500000 37.248837 1143.448837 0.523023 167.166667 1.356148e+07 81.938333 5.833333 0.753953 82293.251667 7.581395 11.342093 24.325581 42.825349 135.720930 199.333721 374.093023 777.197674 708.418605 1273.725814 1869.255814 3730.038605 260.720930 399.103953 30.812500 35.178750 1030.699070 6025.023750
AR NaN NaN 5.440000 32674.520000 2.652800 19.470000 78.883600 0.835600 0.979200 16.207600 29.028800 15.354400 12.666400 20450.480000 40.000000 22025.560000 73.618400 1.258000 32.592800 29.241200 8.465200 14.075200 27477.320000 11245.760000 12798.640000 7215.760000 8606.000000 10218.360000 9461.120000 7651.240000 5535.720000 18.780800 12.641600 29.064400 16.950800 6.596000 58.104000 19.264000 24.798400 17.750800 23.689200 10.165600 27.406400 13.674000 12.065200 3.058000 69.728400 65.208800 76.107600 71.822800 64.511200 71.568800 1191.000000 3.995200 547.760000 19.377200 25.513200 37.754800 43.952000 0.251600 0.360400 0.540400 0.656000 96.800800 0.546000 4.402000 3.130800 2.522800 2.589200 2.421600 62.018800 3.427600 45.65840 2.880000 1280.440000 91.835600 60.400000 3.585200 37.975200 1968.120000 9.751200 0.581600 37220.000000 51488.000000 71760.000000 34540.000000 176.960000 245.360000 313.600000 136.640000 339.800000 26.880000 19.428000 13.452000 15.200000 1.440000 1.369200 63.527600 48.319200 75.919200 85.816000 111.666667 147.090000 88.000000 117.630000 51710.000000 73882.933333 447.833333 147.100000 76.176667 85.573333 14.156667 0.000000 0.270000 14.426667 9.333333 7.333333 42.933333 26.832000 1241.868000 0.415200 70.666667 5.969266e+06 78.543333 3.333333 0.979600 67721.166667 5.680000 10.735600 25.440000 52.083600 102.600000 195.645600 290.560000 460.085200 652.680000 1450.826400 1856.600000 4575.336800 222.760000 450.897200 15.760000 32.514800 718.550400 6509.575600
AZ NaN NaN 5.700000 125811.650000 2.778500 2.610500 84.655500 1.574500 22.434500 14.850000 27.722500 13.808500 11.690500 114430.000000 54.660000 32264.500000 77.323000 0.952000 38.545500 25.294000 6.444000 16.827500 37024.700000 15380.050000 16208.950000 14505.700000 10459.250000 12970.100000 7450.900000 9127.850000 17101.050000 15.036500 9.917500 21.542500 21.650500 7.012000 59.707500 12.751500 22.172500 11.422000 27.247000 9.869000 28.504500 13.229000 11.610000 3.175000 74.329500 70.758500 82.037000 76.480000 58.645500 66.129000 3050.750000 2.618000 10154.800000 14.418000 23.323000 30.726000 37.103500 1.459000 2.286000 2.925500 3.545000 78.144000 4.111000 7.035500 5.244500 2.706500 2.793000 2.571500 65.994500 6.886500 49.96900 2.550000 7948.600000 84.915000 64.188000 1.902000 27.634500 1975.900000 6.852500 0.329000 72980.000000 95985.000000 128820.050000 55840.050000 311.200000 405.650000 499.500000 188.300000 476.700000 28.175000 22.770000 12.635000 125.350000 30.350000 9.245500 29.936000 39.551000 68.547000 74.233000 611.833333 139.425000 575.000000 130.758333 512103.833333 116322.850000 849.000000 139.416667 96.275000 89.226667 1.680000 8.370000 0.415000 10.463333 21.500000 11.000000 161.200000 78.850000 1375.160000 1.103000 366.833333 4.265164e+07 93.966667 6.666667 1.469500 98273.283333 12.400000 4.613500 58.600000 30.195000 292.500000 119.256000 810.000000 397.030000 2281.450000 1510.476500 6869.450000 4325.365000 1501.850000 661.164500 45.100000 35.179000 551.094500 6532.186000
CA 76.0 93325.0 5.591398 79654.179211 2.926129 5.312222 71.973799 9.480896 24.799427 13.974158 28.252581 13.678136 9.780287 76204.000000 85.266989 39655.982079 80.120394 0.827849 40.754624 21.825520 8.877240 14.881900 44720.125448 17465.885305 18957.817204 13960.465950 14410.684588 15125.035842 10407.967742 11528.028674 9958.157706 10.789140 10.443297 22.795986 23.964158 6.350860 63.222581 16.795376 22.322186 12.847097 28.926344 9.677957 32.928100 14.002688 11.870323 3.304301 74.187312 70.442652 83.169928 75.711541 56.891039 64.844875 2630.627240 3.391505 19501.440860 15.708638 24.847025 34.214588 43.798459 3.295878 5.196667 7.149247 9.209498 70.336953 7.405412 9.233799 7.007849 2.851111 2.885735 2.788244 59.075663 11.628315 51.37086 2.448029 1725.663082 94.359534 58.164552 1.918746 20.730000 1967.225806 2.712939 0.429749 175336.935484 219367.068100 269462.820789 94125.885305 490.584229 607.885305 732.716846 242.132616 665.078853 28.691756 24.844086 11.708602 100.139785 49.885305 19.687204 48.018280 44.247742 74.198208 87.774265 458.061224 157.247959 386.244898 134.820204 311710.979592 103479.571020 694.083673 157.248980 82.908571 74.932041 7.714694 10.421837 3.077551 20.878571 24.877551 9.591837 181.189796 33.169534 4693.495699 3.265878 173.734694 3.961899e+07 87.499388 6.020408 1.124695 154858.814490 11.727599 8.134588 32.491039 32.646882 406.215054 273.137276 535.151079 496.704964 1098.870968 1248.996595 2694.913978 3022.680896 965.017921 819.774265 55.698925 47.988530 810.469388 5139.440251
In [ ]:
#find the mean of crime per pop columns by state
state_crimes =crime_data.groupby("state")["murdPerPop", "rapesPerPop", "robbbPerPop", "assaultPerPop", "burglPerPop", "larcPerPop", "autoTheftPerPop", "arsonsPerPop"].mean()
state_crimes.head(10)
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\76998245.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  state_crimes =crime_data.groupby("state")["murdPerPop", "rapesPerPop", "robbbPerPop", "assaultPerPop", "burglPerPop", "larcPerPop", "autoTheftPerPop", "arsonsPerPop"].mean()
Out[ ]:
murdPerPop rapesPerPop robbbPerPop assaultPerPop burglPerPop larcPerPop autoTheftPerPop arsonsPerPop
state
AK 8.870000 66.080000 156.103333 345.516667 639.463333 3813.090000 485.286667 23.833333
AL 11.342093 42.825349 199.333721 777.197674 1273.725814 3730.038605 399.103953 35.178750
AR 10.735600 52.083600 195.645600 460.085200 1450.826400 4575.336800 450.897200 32.514800
AZ 4.613500 30.195000 119.256000 397.030000 1510.476500 4325.365000 661.164500 35.179000
CA 8.134588 32.646882 273.137276 496.704964 1248.996595 3022.680896 819.774265 47.988530
CO 3.822800 41.320400 84.554000 397.163600 965.632800 4010.375600 350.701200 42.820400
CT 3.841972 16.033239 111.564366 127.755217 740.837746 2372.257465 425.045070 19.466338
DC 81.950000 58.480000 1282.850000 1625.090000 2081.590000 5679.780000 1454.870000 36.100000
DE 0.000000 123.330000 267.210000 496.750000 859.880000 6118.530000 260.360000 37.680000
FL 7.305556 49.775889 359.934778 742.029556 1932.676667 5190.722333 866.247889 21.462111
In [ ]:
#find the states with min and max values for crimes
state_crimes["murdPerPop"].sort_values()
state_crimes["rapesPerPop"].sort_values()
state_crimes["robbbPerPop"].sort_values()
state_crimes["assaultPerPop"].sort_values()
state_crimes["burglPerPop"].sort_values()
state_crimes["larcPerPop"].sort_values()
state_crimes["autoTheftPerPop"].sort_values()
state_crimes["arsonsPerPop"].sort_values()
In [ ]:
#find representation of each state in dataset, how many rows does each state have?
crime_data.groupby(['state'])['state'].count().sort_values()
In [ ]:
#Examine the STATE with the overall max values of each crime
crime_data.loc[crime_data['state'] == 'DC']
Out[ ]:
communityname state countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
1581 Washingtoncity DC 1.0 50000.0 8 606900 2.43 65.84 29.6 1.85 5.39 13.54 29.83 15.69 11.53 606900 100.0 30727 78.16 0.5 34.26 20.22 8.94 17.44 36256 18881 34563 12226 14095 16498 10468.0 12525 96278 16.87 9.56 26.85 33.31 7.03 61.62 4.29 33.61 7.37 39.1 13.67 50.76 16.18 15.03 3.15 41.96 33.58 52.23 44.32 66.58 72.92 70523 18.2 58887 23.98 36.96 48.35 57.55 2.33 3.59 4.69 5.58 87.49 2.55 8.96 4.6 2.26 2.5 2.12 42.91 8.25 67.33 2 28855 89.64 38.9 17.4 44.05 1947 4.22 0.81 86700 123900 258700 172000 317 441 618 301 479 25.4 20.5 12.8 4682 131 9.7 39.34 53.49 76.61 76.61 4506.0 813.36 4066.0 733.94 871531.0 157316.09 193.4 813.4 97.32 32.2 64.4 2.57 0.75 65.42 198.0 13.0 197.9 63.6 9538.9 37.3 590.0 208184992.0 90.24 0.0 4.39 375785.19 454 81.95 324.0 58.48 7107.0 1282.85 9003.0 1625.09 11532.0 2081.59 31466.0 5679.78 8060.0 1454.87 200.0 36.1 3048.38 9252.35
In [ ]:
#Sort the dataset by population
crime_data.sort_values(['population'])
In [ ]:
#Subset the states with the most representation in the dataset
max_rep = ['CA', 'NJ', 'TX']
 
maxrep_subset = crime_data[crime_data.state.isin(max_rep)]

maxrep_subset.describe()
Out[ ]:
countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
count 213.000000 213.000000 652.000000 6.520000e+02 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 6.520000e+02 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 6.520000e+02 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.00000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.00000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 652.000000 110.000000 110.000000 110.000000 110.000000 1.100000e+02 110.000000 110.000000 110.000000 110.000000 110.000000 110.000000 110.000000 110.000000 110.000000 110.000000 110.000000 110.000000 652.000000 652.000000 652.000000 110.000000 1.100000e+02 110.000000 110.000000 652.000000 110.000000 652.000000 652.000000 651.000000 651.000000 651.000000 651.000000 646.000000 646.000000 649.000000 649.000000 650.000000 650.000000 652.000000 652.000000 651.000000 651.000000 645.000000 647.000000
mean 19.215962 45170.629108 5.624233 5.946048e+04 2.855675 7.809647 77.526794 5.590368 18.911733 14.076104 27.651979 13.474862 10.642561 5.536025e+04 78.748129 38775.401840 80.323574 0.947699 42.681534 23.992347 7.033880 14.907301 44149.286810 17090.036810 18324.825153 13762.685583 14571.973926 15354.412577 10750.357362 11761.555215 7998.104294 10.885000 10.636656 23.483390 23.986012 6.171641 63.020828 16.390000 23.150000 12.702209 29.008221 8.804233 31.155844 12.393206 10.657147 3.253972 76.047853 72.412132 84.666319 77.069647 57.116626 65.847040 1931.239264 3.038865 1.137832e+04 13.715798 21.905031 30.579509 39.761610 2.179018 3.444632 4.747853 6.143037 75.763282 5.302960 7.452163 5.606672 2.780997 2.869233 2.612193 64.439525 7.83089 46.804586 2.579755 1797.153374 92.747883 62.434816 2.833482 32.559325 1964.822086 4.017209 0.500521 130855.375767 165164.895706 207417.072086 76561.696319 422.260736 535.31135 652.104294 229.843558 605.185583 27.204141 23.227301 13.182669 64.351227 25.808282 14.050905 54.592086 50.449969 76.195046 88.750890 386.645455 187.492182 328.300000 162.856455 2.316315e+05 94973.719909 548.017273 187.495455 83.070182 77.613000 7.260818 11.264182 1.458364 19.696182 19.381818 9.009091 131.661818 28.132362 3909.484509 4.185245 158.345455 2.768150e+07 87.694909 4.545455 0.994172 133338.572636 8.174847 6.437638 27.511521 32.601644 246.944700 200.858464 339.063467 395.167276 833.169492 1087.383436 2220.513846 3027.076031 652.769939 626.186319 38.989247 37.594240 633.092155 4771.453292
std 13.235541 23379.476968 2.891520 1.761253e+05 0.408726 11.403043 16.836132 7.078681 21.017267 3.468795 5.207060 4.374458 4.816070 1.771707e+05 39.884760 14992.840835 7.663542 0.720604 14.193613 8.329159 5.210186 4.670639 16305.422483 7518.350487 7664.944504 11781.611890 21453.858461 8306.152573 7484.655939 6152.004224 32504.649723 8.717475 9.042146 13.260618 13.906079 3.106506 8.291967 6.963107 6.493035 6.571939 10.689668 2.665646 7.469032 3.272733 2.920009 0.306269 9.115853 10.484327 10.127028 9.013849 7.681386 6.533787 9642.018433 2.968143 5.711001e+04 6.508180 9.211187 11.316008 13.783983 2.194139 3.377931 4.494551 5.832892 18.723682 6.274762 5.279596 4.428548 0.386238 0.347950 0.500160 15.662616 8.60621 16.396064 0.541450 6516.092183 5.937758 15.509878 3.600391 15.338566 10.768572 4.408227 0.548299 86051.081556 103692.962681 122233.431468 47804.118771 161.275321 188.10053 212.395831 86.642121 183.577918 2.978437 2.810971 1.903225 304.640980 172.904158 10.888643 12.661013 11.309996 10.228695 5.937045 920.540095 89.599880 825.405594 71.528482 5.986471e+05 56905.574568 320.126315 89.602001 12.075083 18.477372 10.003297 15.940309 2.661775 18.440615 42.284528 2.780982 100.500720 145.747217 4050.534161 5.706809 328.035477 5.733176e+07 9.596803 3.922732 2.635903 71346.126995 50.019629 8.563770 102.932993 29.623423 1633.680281 271.942800 1806.484322 362.778347 2758.257846 626.537587 7014.427370 1673.329990 2990.114617 605.970864 226.700972 43.713629 590.534673 2451.570076
min 1.000000 70.000000 1.000000 1.002300e+04 1.600000 0.000000 7.210000 0.060000 0.770000 4.580000 10.550000 4.930000 1.660000 0.000000e+00 0.000000 12908.000000 31.680000 0.000000 9.020000 4.810000 0.640000 3.460000 14257.000000 5237.000000 5472.000000 0.000000 0.000000 0.000000 0.000000 3188.000000 78.000000 0.730000 0.620000 2.520000 1.630000 1.610000 24.820000 3.060000 8.690000 1.370000 6.480000 2.350000 13.300000 4.490000 3.640000 2.500000 32.240000 26.110000 27.430000 30.640000 33.730000 44.650000 0.000000 0.000000 5.600000e+01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.150000 0.000000 1.560000 0.480000 1.580000 1.610000 1.580000 13.930000 0.18000 3.060000 1.000000 36.000000 37.470000 16.860000 0.000000 3.120000 1939.000000 0.000000 0.000000 15700.000000 27200.000000 42200.000000 0.000000 99.000000 144.00000 226.000000 0.000000 192.000000 14.900000 15.300000 10.100000 0.000000 0.000000 0.510000 20.210000 11.830000 27.950000 49.800000 70.000000 47.030000 69.000000 25.280000 1.415500e+04 15370.900000 72.400000 47.000000 42.310000 1.600000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.900000 16.300000 0.000000 20.000000 4.288914e+06 35.010000 0.000000 0.000000 37041.600000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 3.230000 6.000000 49.170000 75.000000 396.560000 1.000000 7.210000 0.000000 0.000000 15.030000 696.610000
25% 7.000000 25770.000000 3.000000 1.559650e+04 2.610000 1.275000 68.767500 1.237500 4.420000 12.057500 24.670000 11.310000 7.480000 1.098100e+04 93.325000 27160.250000 76.090000 0.507500 31.957500 18.327500 3.020000 11.850000 31316.250000 11933.250000 13198.250000 8311.250000 8240.250000 10678.250000 6863.750000 7624.250000 918.250000 4.085000 4.537500 13.067500 13.527500 3.877500 58.337500 11.707500 19.095000 8.050000 21.442500 6.897500 26.310000 10.047500 8.637500 3.070000 70.917500 65.840000 78.687500 72.522500 51.855000 62.030000 161.750000 1.120000 1.200750e+03 9.107500 15.642500 23.212500 30.422500 0.620000 1.027500 1.437500 1.955000 69.727500 1.317500 4.137500 3.000000 2.540000 2.657500 2.247500 54.685000 2.32000 36.037500 2.000000 332.750000 90.530000 53.177500 0.780000 19.875000 1957.000000 0.730000 0.180000 63825.000000 81975.000000 105050.000000 41600.000000 302.500000 387.00000 479.000000 162.000000 455.750000 24.975000 21.475000 11.700000 0.000000 0.000000 5.742500 45.632500 42.310000 70.985000 86.175000 126.000000 125.192500 107.250000 112.335000 5.000000e+04 56083.725000 341.925000 125.200000 76.587500 70.150000 1.602500 2.862500 0.000000 7.937500 6.000000 7.000000 60.725000 5.700000 1501.675000 0.577500 52.000000 7.804816e+06 85.490000 0.000000 0.000000 91056.775000 0.000000 0.000000 3.000000 11.870000 9.000000 44.015000 28.000000 128.790000 130.000000 640.120000 390.250000 1799.760000 41.000000 217.457500 2.000000 11.165000 231.480000 2941.910000
50% 21.000000 45990.000000 6.000000 2.676400e+04 2.795000 3.425000 81.355000 3.320000 10.785000 13.765000 27.245000 12.605000 10.210000 2.352500e+04 100.000000 36308.000000 80.370000 0.730000 42.265000 23.465000 5.475000 14.290000 41761.000000 15627.000000 16674.500000 12222.500000 12238.000000 14159.000000 9148.500000 10264.000000 2221.500000 7.805000 7.935000 21.550000 20.365000 5.380000 63.310000 15.795000 22.375000 12.075000 26.965000 8.805000 29.865000 12.605000 10.715000 3.200000 76.060000 72.680000 85.605000 77.460000 57.230000 66.560000 432.500000 2.210000 3.057500e+03 13.610000 21.710000 30.735000 40.180000 1.415000 2.320000 3.315000 4.205000 81.475000 2.835000 5.595000 4.130000 2.730000 2.855000 2.540000 64.350000 5.03500 47.845000 3.000000 612.000000 94.685000 62.340000 1.780000 30.815000 1966.000000 2.250000 0.350000 123500.000000 153200.000000 187550.000000 61600.000000 428.500000 544.50000 659.500000 215.000000 613.000000 27.300000 23.400000 12.700000 0.000000 0.000000 11.160000 53.800000 50.220000 78.245000 89.890000 161.000000 161.125000 140.000000 138.020000 7.524050e+04 80827.600000 450.950000 161.150000 85.965000 82.620000 3.345000 7.430000 0.095000 14.920000 9.000000 9.000000 110.600000 11.600000 2739.000000 2.005000 71.000000 1.194086e+07 89.910000 5.000000 0.000000 118573.900000 1.000000 3.995000 8.000000 24.610000 32.000000 106.070000 87.000000 277.020000 288.000000 983.410000 792.000000 2747.070000 131.500000 433.590000 8.000000 25.220000 461.980000 4520.890000
75% 27.000000 63360.000000 8.000000 5.233275e+04 3.040000 8.917500 90.522500 6.635000 23.905000 15.480000 29.767500 14.475000 13.220000 5.198125e+04 100.000000 46892.500000 85.520000 1.162500 53.237500 29.192500 9.740000 17.380000 53311.250000 20130.000000 21471.000000 16412.500000 16644.500000 18218.750000 12565.750000 14357.500000 5541.500000 16.092500 13.322500 30.955000 31.772500 7.840000 68.592500 20.272500 26.230000 15.917500 35.010000 10.650000 34.792500 14.675000 12.745000 3.370000 82.675000 80.325000 92.537500 83.150000 62.312500 70.330000 1200.750000 3.782500 7.961000e+03 17.505000 27.945000 38.275000 49.940000 3.172500 4.907500 6.905000 8.855000 88.545000 6.542500 8.932500 6.357500 2.972500 3.020000 2.880000 75.432500 9.87000 57.645000 3.000000 1284.500000 96.307500 72.920000 3.592500 43.257500 1973.000000 5.892500 0.620000 175700.000000 213775.000000 275850.000000 97925.000000 535.000000 655.00000 814.500000 282.250000 728.000000 29.100000 25.000000 14.200000 25.250000 2.000000 19.002500 63.002500 58.712500 83.455000 92.572500 237.500000 234.467500 207.000000 202.740000 1.575000e+05 118636.875000 687.550000 234.475000 92.090000 89.612500 9.262500 12.350000 1.985000 24.750000 19.000000 11.000000 161.750000 23.900000 4966.375000 5.677500 114.750000 1.813866e+07 93.045000 10.000000 0.000000 158150.800000 4.000000 8.850000 19.000000 45.155000 116.500000 238.850000 232.750000 580.935000 665.000000 1415.030000 1773.500000 3862.060000 409.500000 857.282500 21.000000 48.000000 833.290000 5927.930000
max 81.000000 93480.000000 10.000000 3.485398e+06 5.280000 89.950000 99.170000 57.460000 95.290000 43.380000 66.640000 55.320000 52.770000 3.485398e+06 100.000000 123625.000000 96.760000 6.320000 79.430000 76.390000 26.920000 45.510000 131315.000000 63302.000000 68850.000000 212120.000000 480000.000000 82133.000000 125526.000000 51320.000000 643809.000000 48.820000 49.890000 73.660000 73.630000 23.830000 83.810000 41.690000 62.670000 44.270000 64.970000 19.090000 71.260000 23.460000 19.100000 4.640000 93.600000 92.580000 100.000000 97.340000 78.870000 83.960000 212238.000000 24.190000 1.336665e+06 38.520000 54.190000 69.760000 78.740000 13.710000 19.930000 25.340000 32.630000 97.330000 38.330000 34.870000 30.870000 4.520000 4.480000 4.730000 96.140000 59.49000 95.340000 4.000000 109558.000000 99.000000 96.220000 39.890000 78.590000 1987.000000 23.630000 5.330000 500001.000000 500001.000000 500001.000000 267700.000000 1001.000000 1001.00000 1001.000000 675.000000 1001.000000 35.100000 32.700000 23.400000 4597.000000 3109.000000 60.400000 92.040000 76.440000 95.850000 99.900000 8295.000000 623.660000 7683.000000 383.830000 5.480855e+06 316432.500000 2156.500000 623.700000 100.000000 100.000000 59.520000 98.400000 18.570000 98.400000 406.000000 14.000000 439.100000 3569.800000 44229.900000 52.590000 2482.000000 4.880505e+08 99.410000 10.000000 18.420000 546841.190000 1076.000000 63.940000 1773.000000 170.670000 38415.000000 2264.130000 42437.000000 2191.730000 50232.000000 4848.970000 119092.000000 22164.780000 59764.000000 4968.590000 5119.000000 377.610000 3928.030000 27010.770000
In [ ]:
#Subset the states with the least representation
min_rep = ['KS', 'DE', 'DC', 'AK', 'VT']
 
minrep_subset = crime_data[crime_data.state.isin(min_rep)]

minrep_subset.describe()
Out[ ]:
countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
count 6.000000 6.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.00000 10.000000 10.00000 10.00000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.00000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.00000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.00000 10.000000 10.000000 10.00000 10.00000 10.000000 10.000000 10.000000 10.000000 10.000000 10.00000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.00000 10.00000 10.000000 10.000000 10.000000 10.000000 10.000000 10.00000 4.000000 4.000000 4.000000 4.000000 4.000000 4.000000 4.00000 4.000000 4.000000 4.000000 4.000000 4.000000 4.00000 4.000000 4.000000 4.00000 4.000000 10.000000 10.00000 10.000000 4.000000 4.000000e+00 4.000000 4.000000 10.000000 4.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 5.000000 5.000000 10.000000 5.000000
mean 36.000000 46991.666667 5.600000 130737.000000 2.644000 13.102000 80.219000 2.356000 2.900000 15.926000 31.567000 17.090000 8.383000 122225.900000 66.679000 34658.500000 83.24900 0.890000 53.73300 19.05700 6.652000 14.514000 40603.800000 16394.200000 18934.400000 11684.700000 10802.300000 11941.500000 7550.700000 11422.800000 16641.300000 10.43400 5.671000 15.547000 27.992000 5.550000 67.669000 10.500000 27.004000 9.710000 31.997000 11.131000 37.087000 14.195000 12.698000 3.070000 69.078000 66.923000 78.012000 71.999000 64.057000 72.826000 8707.900000 4.84600 9168.900000 16.070000 23.136000 32.390000 38.655000 0.826000 1.221000 1.694000 2.029000 91.849000 0.999000 4.577000 2.926000 2.478000 2.64700 2.272000 56.559000 3.66400 53.05900 2.500000 5759.800000 91.135000 52.967000 5.460000 37.11200 1962.100000 4.333000 0.703000 79560.000000 102790.000000 142730.000000 63170.000000 353.300000 458.800000 576.300000 223.000000 523.100000 25.580000 21.270000 12.57000 540.20000 21.400000 4.892000 45.596000 42.259000 72.017000 78.10500 1321.500000 321.770000 1195.500000 292.042500 342453.750000 103870.247500 467.72500 321.775000 93.342500 78.467500 18.317500 1.822500 0.84500 20.410000 57.250000 10.00000 110.825000 475.750000 2144.41000 5.266000 273.000000 6.771188e+07 91.565000 3.750000 1.568000 183933.272500 53.300000 13.211000 91.500000 62.496000 917.700000 250.089000 1187.900000 374.653000 2093.700000 1109.321000 6643.600000 4542.293000 1284.500000 463.628000 65.000000 29.056000 700.452000 6282.766000
std 67.441827 28123.129567 2.458545 195996.657595 0.183073 20.853535 21.288761 1.411486 1.969202 4.659438 6.288545 6.490518 3.757798 200855.131413 46.991043 8091.644803 5.91336 0.282921 17.13674 7.47369 2.777384 3.086009 7881.706186 2864.852131 6263.841867 4743.836166 3944.801165 3917.865894 4963.835637 3793.149462 30194.074621 5.25849 2.456413 6.050423 6.726027 1.799451 5.265294 7.166563 4.380231 2.491113 5.597257 2.143805 9.211694 2.128407 2.070452 0.110454 10.550211 13.091965 10.966617 10.505742 3.713286 4.229474 21871.270031 4.93539 18148.978878 7.175449 8.711964 9.665053 12.176344 0.594516 0.907738 1.151831 1.383598 2.375563 0.713621 1.920151 0.990311 0.159081 0.12979 0.180358 13.593108 2.54648 7.30352 0.527046 9292.176049 3.579349 13.167262 4.960423 9.26628 13.763478 2.358983 0.677004 16780.623747 21183.140571 47476.030209 39215.559554 73.063215 89.197409 110.048525 54.108944 86.992273 2.081559 1.263197 1.22479 1458.76613 41.414437 1.934378 14.947427 7.373137 9.247546 8.72826 2127.326413 332.327943 1917.553041 298.699678 376930.265812 47184.416303 263.97143 332.352317 4.302126 31.166043 30.793664 1.280478 1.24141 30.252446 94.026149 2.94392 85.098938 946.994925 2844.69981 11.358544 238.753709 9.423337e+07 2.175416 4.787136 2.240812 131168.014654 141.648273 25.019023 124.392971 31.185403 2215.698036 391.183905 2792.666167 484.606807 3763.327902 545.433585 10214.814233 1273.663253 2555.521397 447.962126 86.709284 14.479887 902.569245 2102.843563
min 1.000000 10675.000000 2.000000 12809.000000 2.420000 0.390000 29.600000 0.460000 0.450000 11.720000 24.660000 10.530000 3.480000 0.000000 0.000000 25434.000000 72.91000 0.420000 34.26000 9.30000 1.140000 10.970000 31083.000000 12984.000000 13023.000000 6507.000000 5974.000000 6598.000000 0.000000 7964.000000 453.000000 3.62000 2.810000 8.970000 18.340000 2.460000 59.670000 2.350000 21.640000 7.260000 23.610000 7.970000 28.450000 10.330000 9.280000 2.930000 41.960000 33.580000 52.230000 44.320000 58.430000 67.490000 124.000000 1.38000 473.000000 7.400000 8.880000 15.860000 15.860000 0.190000 0.230000 0.410000 0.410000 87.490000 0.110000 2.480000 1.780000 2.260000 2.50000 2.020000 31.880000 0.73000 45.87000 2.000000 259.000000 85.220000 31.090000 0.390000 25.00000 1939.000000 0.660000 0.250000 41600.000000 56700.000000 78000.000000 36400.000000 227.000000 299.000000 385.000000 155.000000 395.000000 23.700000 19.500000 10.90000 0.00000 0.000000 2.590000 25.240000 28.170000 52.730000 59.20000 91.000000 104.930000 83.000000 99.410000 40473.000000 42529.000000 193.40000 104.900000 89.390000 32.200000 0.000000 0.000000 0.00000 0.000000 2.000000 6.00000 0.000000 7.900000 10.00000 0.290000 29.000000 6.124758e+06 90.070000 0.000000 0.000000 86495.300000 0.000000 0.000000 3.000000 19.630000 0.000000 0.000000 1.000000 6.540000 117.000000 443.000000 370.000000 2420.830000 13.000000 85.060000 2.000000 5.850000 26.170000 3637.430000
25% 7.000000 23656.250000 4.000000 20360.250000 2.470000 0.670000 74.420000 1.580000 0.997500 12.987500 27.702500 12.900000 4.875000 2534.500000 17.205000 28699.750000 79.68000 0.775000 41.14250 12.14000 6.312500 12.342500 34943.250000 14207.750000 15437.500000 8634.500000 8367.500000 9472.000000 4290.750000 9250.250000 1632.250000 5.96000 3.557500 10.157500 23.682500 4.240000 63.130000 4.132500 22.905000 7.610000 28.460000 9.442500 30.772500 13.212500 11.452500 2.975000 66.645000 63.977500 76.795000 71.767500 61.520000 69.590000 329.250000 2.57500 766.500000 10.310000 17.052500 25.772500 30.862500 0.505000 0.860000 1.325000 1.507500 90.505000 0.567500 3.050000 1.950000 2.345000 2.53250 2.150000 47.117500 1.40500 47.82000 2.000000 622.750000 88.290000 43.382500 2.412500 32.25250 1949.750000 3.452500 0.287500 72875.000000 90350.000000 115875.000000 43800.000000 305.750000 403.500000 500.500000 180.750000 472.250000 24.075000 20.350000 11.60000 14.50000 0.250000 3.820000 35.297500 38.192500 68.627500 74.97000 222.250000 127.400000 209.750000 115.970000 90976.500000 88653.025000 352.32500 127.400000 89.742500 75.152500 3.105000 1.410000 0.00000 5.677500 10.250000 9.00000 70.950000 18.425000 474.45000 0.637500 136.250000 2.187468e+07 90.197500 0.000000 0.000000 108644.900000 0.000000 0.000000 9.750000 43.902500 3.250000 13.567500 14.250000 70.010000 142.500000 747.592500 728.750000 3678.982500 28.250000 142.757500 7.000000 24.230000 125.725000 5535.310000
50% 7.000000 55612.500000 5.500000 29236.500000 2.685000 3.760000 81.515000 1.965000 2.810000 13.750000 29.405000 14.555000 9.055000 20169.000000 98.985000 31670.500000 82.05500 0.865000 47.63500 20.44000 6.830000 13.645000 37157.000000 14936.500000 16479.500000 11166.500000 9841.000000 11700.000000 8708.500000 9801.500000 3000.000000 11.24500 5.315000 15.710000 27.575000 6.020000 68.585000 11.115000 27.390000 9.095000 31.565000 11.875000 33.380000 14.530000 13.190000 3.050000 72.870000 70.855000 78.900000 73.125000 63.930000 71.920000 775.500000 3.33000 1416.500000 14.225000 22.830000 32.600000 38.995000 0.700000 1.035000 1.480000 1.725000 92.520000 0.790000 4.510000 2.990000 2.455000 2.62500 2.235000 58.445000 3.29500 50.03000 2.500000 837.500000 92.130000 53.650000 4.065000 35.80500 1969.000000 4.360000 0.505000 84200.000000 111600.000000 143950.000000 56350.000000 346.000000 454.000000 583.000000 220.500000 508.000000 25.100000 21.400000 12.75000 48.00000 2.000000 4.505000 45.480000 42.560000 73.640000 78.64000 344.500000 184.395000 316.500000 167.410000 228905.500000 107817.950000 425.05000 184.400000 93.330000 90.835000 4.435000 2.225000 0.37500 8.110000 14.500000 10.50000 122.700000 35.400000 1100.80000 1.195000 236.500000 2.826889e+07 90.725000 2.500000 0.000000 136726.300000 1.000000 4.120000 25.500000 62.655000 43.500000 127.315000 88.500000 219.560000 251.000000 907.585000 1623.000000 4727.200000 84.000000 248.415000 11.000000 36.100000 461.230000 5712.280000
75% 17.500000 64937.500000 7.000000 179535.250000 2.777500 12.540000 96.972500 3.100000 4.787500 18.217500 34.950000 20.187500 10.740000 176090.500000 100.000000 41595.500000 88.58000 1.120000 62.96000 23.59750 8.122500 16.110000 47186.000000 19435.250000 21030.500000 13316.500000 13261.250000 14098.250000 10198.500000 12570.750000 13352.500000 12.46500 7.807500 18.045000 32.660000 6.422500 71.245000 14.332500 29.802500 11.115000 36.330000 12.895000 38.682500 15.995000 14.375000 3.172500 74.970000 73.390000 82.762500 77.340000 66.407500 75.075000 3705.500000 4.25750 9601.250000 21.602500 29.967500 37.975000 44.575000 0.940000 1.247500 1.572500 2.070000 92.725000 1.357500 5.300000 3.630000 2.620000 2.74750 2.385000 62.820000 5.22750 57.14750 3.000000 9001.250000 94.085000 58.750000 5.772500 41.64250 1971.750000 5.150000 0.760000 88600.000000 118750.000000 152350.000000 59500.000000 406.250000 514.250000 651.750000 270.500000 560.500000 25.850000 21.875000 13.10000 171.00000 26.250000 5.212500 57.707500 46.192500 77.885000 83.39250 1443.750000 378.765000 1302.250000 343.482500 480382.750000 123035.172500 540.45000 378.775000 96.930000 94.150000 19.647500 2.637500 1.22000 22.842500 61.500000 11.50000 162.575000 105.375000 2488.37500 3.132500 373.250000 7.410609e+07 92.092500 6.250000 2.852500 212014.672500 18.750000 13.750000 168.000000 80.992500 446.000000 258.875000 863.500000 477.072500 1565.000000 1458.805000 8562.250000 5570.495000 1100.500000 665.182500 105.000000 37.680000 874.262500 7276.460000
max 173.000000 79000.000000 10.000000 606900.000000 2.920000 65.840000 98.930000 4.820000 5.390000 26.330000 45.340000 32.370000 15.230000 606900.000000 100.000000 47924.000000 90.83000 1.270000 89.04000 33.08000 10.930000 21.120000 54088.000000 20125.000000 34563.000000 23000.000000 17500.000000 18666.000000 13713.000000 20971.000000 96278.000000 19.34000 9.560000 26.850000 39.120000 8.620000 75.000000 25.410000 33.610000 14.330000 40.150000 13.670000 55.640000 16.430000 15.030000 3.210000 77.390000 78.060000 95.600000 82.350000 69.420000 79.410000 70523.000000 18.20000 58887.000000 28.890000 36.960000 48.350000 57.550000 2.330000 3.590000 4.690000 5.580000 95.170000 2.550000 8.960000 4.600000 2.680000 2.84000 2.590000 74.490000 8.25000 67.33000 3.000000 28855.000000 95.240000 71.630000 17.400000 56.91000 1976.000000 8.200000 2.490000 98300.000000 123900.000000 258700.000000 172000.000000 454.000000 587.000000 736.000000 301.000000 664.000000 30.100000 23.500000 14.90000 4682.00000 131.000000 9.700000 67.860000 53.490000 84.460000 89.09000 4506.000000 813.360000 4066.000000 733.940000 871531.000000 157316.090000 827.40000 813.400000 97.320000 100.000000 64.400000 2.840000 2.63000 65.420000 198.000000 13.00000 197.900000 2686.900000 9538.90000 37.300000 590.000000 2.081850e+08 94.740000 10.000000 6.020000 375785.190000 454.000000 81.950000 324.000000 123.330000 7107.000000 1282.850000 9003.000000 1625.090000 11532.000000 2081.590000 31466.000000 6118.530000 8060.000000 1454.870000 200.000000 41.420000 3048.380000 9252.350000

* MODULE 3: INITIAL CODE AND RESULTS

In [ ]:
#az = crime_data.loc[crime_data['state'] == "CA"]
#az.boxplot(column="murdPerPop")

#dc = crime_data.loc[crime_data['state']== "DC"]

Previously, we have discovered a number of missing values in the dataset. The number of values missing in each column varies. At this point, there are missing number values which are significant and cannot be used in the analysis. These columns will have to be dropped. I have decided on a NaN threshold of 50% (0.5), so any columns which are missing 50% of its values or more will be dropped from the dataset.

In [ ]:
#after deciding on a NaN ratio threshold, drop the columns which exceed the threshold (50%)

crimedata_reduced= crime_data.drop(['countyCode', 'communityCode', 'fold', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 'PolicBudgPerPop'], axis=1)
crimedata_reduced.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murders murdPerPop rapes rapesPerPop robberies robbbPerPop assaults assaultPerPop burglaries burglPerPop larcenies larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 21.44 10.93 11.33 11980 100.0 75122 89.24 1.55 70.20 23.62 1.03 18.39 79584 29711 30233 13600 5725 27101 5115.0 22838 227 1.96 5.81 9.90 48.18 2.70 64.55 14.65 28.82 5.49 50.73 3.67 26.38 5.22 4.47 3.22 91.43 90.17 95.78 95.81 44.56 58.88 31 0.36 1277 8.69 13.00 20.99 30.93 0.93 1.39 2.24 3.30 85.68 1.37 4.81 4.17 2.99 3.00 2.84 91.46 0.39 11.06 3 64 98.37 91.01 3.12 37.50 1959 0.00 0.28 215900 262600 326900 111000 685 1001 1001 316 1001 23.8 21.1 14.0 11 0 10.66 53.72 65.29 78.09 89.14 6.5 1845.9 9.63 0.0 0 0.0 0.0 0.00 1.0 8.20 4.0 32.81 14.0 114.85 138.0 1132.08 16.0 131.26 2.0 16.41 41.02 1394.59
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 21.30 10.48 17.18 23123 100.0 47917 78.99 1.11 64.11 35.50 2.75 22.85 55323 20148 20191 18137 0 20074 5250.0 12222 885 3.98 5.61 13.72 29.89 2.43 61.96 12.26 29.28 6.39 37.64 4.23 27.99 6.45 5.42 3.11 86.91 85.33 96.82 86.46 51.14 62.43 43 0.24 1920 5.21 8.65 13.33 22.50 0.43 0.72 1.11 1.87 87.79 1.81 4.25 3.34 2.70 2.83 1.96 89.03 1.01 23.60 3 240 97.15 84.88 0.00 18.33 1958 0.31 0.14 136300 164200 199900 63600 467 560 672 205 627 27.6 20.7 12.5 0 0 8.30 77.17 71.27 90.22 96.12 10.6 2186.7 3.84 0.0 0 0.0 1.0 4.25 5.0 21.26 24.0 102.05 57.0 242.37 376.0 1598.78 26.0 110.55 1.0 4.25 127.56 1955.95
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 25.88 11.01 10.28 29344 100.0 35669 82.00 1.15 55.73 22.25 2.94 14.56 42112 16946 17103 16644 21606 15528 5954.0 8405 1389 4.75 2.80 9.09 30.13 4.01 69.80 15.95 21.52 8.79 32.48 10.10 25.78 14.76 12.55 2.95 78.54 78.85 92.37 75.72 66.08 74.19 164 0.88 1468 16.42 23.98 32.08 35.63 0.82 1.20 1.61 1.78 93.11 1.14 2.97 2.05 2.42 2.69 2.06 64.18 2.03 47.46 3 544 95.68 57.79 0.92 7.54 1976 1.55 0.12 74700 90400 112000 37300 370 428 520 150 484 24.1 21.7 11.6 16 0 5.00 44.77 36.60 61.26 82.85 10.6 2780.9 4.37 0.0 3 8.3 6.0 16.60 56.0 154.95 14.0 38.74 274.0 758.14 1797.0 4972.19 136.0 376.30 22.0 60.87 218.59 6167.51
3 Gloversvillecity NY 16656 2.40 1.70 97.35 0.50 0.70 12.55 25.20 12.19 17.57 0 0.0 20580 68.15 0.24 38.95 39.48 11.71 18.33 26501 10810 10909 9984 4941 3541 2451.0 4391 2831 17.23 11.05 33.68 10.81 9.86 54.74 31.22 27.43 26.76 22.71 10.98 28.15 14.47 12.91 2.98 64.02 62.36 65.38 67.43 59.59 70.27 561 3.84 339 13.86 13.86 15.34 15.34 0.28 0.28 0.31 0.31 94.98 0.56 3.93 2.56 2.37 2.51 2.20 58.18 1.21 45.66 3 669 91.19 54.89 2.54 57.85 1939 7.00 0.87 36400 49600 66500 30100 195 250 309 114 333 28.7 20.6 14.5 0 0 2.04 88.71 56.70 90.17 96.24 5.2 3217.7 3.31 0.0 0 0.0 10.0 57.86 10.0 57.86 33.0 190.93 225.0 1301.78 716.0 4142.56 47.0 271.93 NaN NaN 306.64 NaN
4 Bemidjicity MN 11245 2.76 0.53 89.16 1.17 0.52 24.46 40.53 28.69 12.65 0 0.0 17390 69.33 0.55 42.82 32.16 11.21 14.43 24018 8483 9009 887 4425 3352 3000.0 1328 2855 29.99 12.15 23.06 25.28 9.08 52.44 6.89 36.54 10.94 27.80 7.51 50.66 11.64 9.73 2.98 58.59 55.20 66.51 79.17 61.22 68.94 402 4.70 196 46.94 56.12 67.86 69.90 0.82 0.98 1.18 1.22 94.64 0.39 5.23 3.11 2.35 2.55 2.12 58.13 2.94 55.64 2 333 92.45 53.57 3.90 42.64 1958 7.45 0.82 30600 43200 59500 28900 202 283 362 160 332 32.2 23.2 12.9 2 0 1.74 73.75 42.22 60.34 89.02 11.5 974.2 0.38 0.0 0 0.0 NaN NaN 4.0 32.04 14.0 112.14 91.0 728.93 1060.0 8490.87 91.0 728.93 5.0 40.05 NaN 9988.79

First, I will decide on my target/dependent variables. The two I have chosen are MURDERS and ROBBERIES. Many of my potential dependent variables are missing values, and are not able to be imputed as it may introduce bias. For this reason, I have chosen these 2 categories as my dependent variables because they have the LEAST amount of missing values. Now that my dependent variables are chosen, I can move on to dealing with my independent variables.

There are still a number of columns that remain with missing values. The number of NaNs, however, were below the threshold of 0.5 so they will be used in the analysis. To deal with these missing values, we will turn to imputation. For the columns of our independent variables with 1-15 total missing values, I have chosen to impute them with the overall mode of the column. Because the distribution of the columns is skewed, I have chosen to impute with the mode value per each individual column.

However, there are still a number of columns with a significant number of missing values, but fall below the threshold. These will have to be dealt with differently. For these values, I have chosen to aggregate the data first, and then choose my imputation method. The nature of the organization of the crimes is by neighbourhood, which are each found in a state. I thought aggregrating by state was the most logical choice, as we may find more geographical, social, and economic similarties overall (but not entirely) within each state and its population. Because there are a different number of neighbourhoods represented for each state, the representation is unbalanced. Because of this, I thought the most appropriate imputation method would be to impute these missing values with the mean by state, per each column.

In [ ]:
#impute missing values
#for columns with very small number of missing values, impute with MODE

crimedata_reduced['burglaries'].fillna(crimedata_reduced['burglaries'].mode()[0], inplace=True)
crimedata_reduced['burglPerPop'].fillna(crimedata_reduced['burglPerPop'].mode()[0], inplace=True)
crimedata_reduced['larcenies'].fillna(crimedata_reduced['larcenies'].mode()[0], inplace=True)
crimedata_reduced['larcPerPop'].fillna(crimedata_reduced['larcPerPop'].mode()[0], inplace=True)
crimedata_reduced['autoTheft'].fillna(crimedata_reduced['autoTheft'].mode()[0], inplace=True)
crimedata_reduced['autoTheftPerPop'].fillna(crimedata_reduced['autoTheftPerPop'].mode()[0], inplace=True)
crimedata_reduced['OtherPerCap'].fillna(crimedata_reduced['OtherPerCap'].mode()[0], inplace=True)
crimedata_reduced['assaults'].fillna(crimedata_reduced['assaults'].mode()[0], inplace=True)
crimedata_reduced['assaultPerPop'].fillna(crimedata_reduced['assaultPerPop'].mode()[0], inplace=True)
In [ ]:
#Find out the total counts of how many times each state is represented in the dataset
#With this information, I have chosen to impute by the mean value of the column, aggregated by state

#crimedata_reduced[crimedata_reduced['rapes'].isnull()]
#crimedata_reduced.pivot_table(index = ['state'], aggfunc ='size')

from collections import Counter
print(Counter(crimedata_reduced['state']))
Counter({'CA': 279, 'NJ': 211, 'TX': 162, 'MA': 123, 'OH': 111, 'MI': 108, 'PA': 101, 'FL': 90, 'CT': 71, 'MN': 66, 'WI': 60, 'IN': 48, 'NY': 46, 'NC': 46, 'AL': 43, 'MO': 42, 'WA': 40, 'IL': 40, 'GA': 37, 'OK': 36, 'TN': 35, 'VA': 33, 'OR': 31, 'SC': 28, 'KY': 26, 'RI': 26, 'AR': 25, 'CO': 25, 'UT': 24, 'LA': 22, 'NH': 21, 'MS': 20, 'AZ': 20, 'IA': 20, 'ME': 17, 'WV': 14, 'MD': 12, 'NM': 10, 'SD': 9, 'ND': 8, 'WY': 7, 'ID': 7, 'NV': 5, 'VT': 4, 'AK': 3, 'KS': 1, 'DE': 1, 'DC': 1})

At this point, I will go through each state, and aggregate them one by one into their own, new data frames. For each seperate state dataframe, I will check for missing values. Rather than checking for the total number, I want the column names of where these missing values are located. I will then confirm the total length/number of rows in the dataset, and then check the total number of missing values in each column to get an idea of how much information is missing in each column per state. If the ratio is not significant, I will then move forward and impute the missing values as discussed previously, and then confirm if there are still any columns left with missing values.

Once confirmed that there are no longer any columns missing values, I will have to put these values back into the main dataset. I have filtered out the main dataset by state, so I will create a new dataframe titled crimedata_new. This new dataframe will include all the data, EXCEPT the data of the particular state I was working with. I will then append the seperate state dataframe to crimedata_new, replacing the old data of the state in which values were missing. Once again, I will confirm that all missing values were imputed as planned, by checking the count of missing values of crimedata_new, indexed by the particular state.

STATE: CA

In [ ]:
#Separate all rows represented by the state CA, and create a dataframe with them
CA = crimedata_reduced[crimedata_reduced['state']=='CA']
CA_df= pd.DataFrame(CA)

#Check for missing values: return names of each column that carries a missing value
CA_df.columns[CA_df.isna().any()]

#Check the total number of rows represented by the particular state
len(CA_df)

#Check total number of missing values in that column to establish the ratio of missing information that exists here
CA_df['ViolentCrimesPerPop'].isna().sum()

#If the ratio of missing information is not significant, impute the missing values with the mean of that column of its own state
CA_df['ViolentCrimesPerPop'].fillna(CA_df['ViolentCrimesPerPop'].mean(), inplace = True)

#Confirm values were successfully imputed
CA_df.columns[CA_df.isna().any()]
Out[ ]:
Index([], dtype='object')
In [ ]:
#Create a dataframe where we filter out all rows represented by the particular state
ca_filtered = crimedata_reduced[crimedata_reduced['state'] != 'CA']

#Create a new dataframe which will serve as our main, fully imputed dataframe
#In this new dataframe, take the filtered dataframe and append the imputed state dataframe to it
crimedata_new = pd.concat([ca_filtered, CA_df], ignore_index=True)

#Confirm there are no missing values in the rows of the particular state in the new main dataframe
Counter(crimedata_new[crimedata_new['state'] == "CA"].isna().any())
Out[ ]:
Counter({False: 122})

I will repeat the previous steps with every single state represented. If no columns are returned with missing values, I will move onto the next state.

STATE: NJ

In [ ]:
NJ = crimedata_reduced[crimedata_reduced['state']=='NJ']
NJ_df= pd.DataFrame(NJ)
NJ_df.columns[NJ_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: TX

In [ ]:
TX = crimedata_reduced[crimedata_reduced['state']=='TX']
TX_df= pd.DataFrame(TX)
TX_df.columns[TX_df.isna().any()]

len(TX_df)
TX_df['rapes'].isna().sum()
TX_df['rapesPerPop'].isna().sum()
TX_df['arsons'].isna().sum()
TX_df['arsonsPerPop'].isna().sum()
TX_df['ViolentCrimesPerPop'].isna().sum()
TX_df['nonViolPerPop'].isna().sum()
Out[ ]:
5
In [ ]:
TX_df['rapes'].fillna(TX_df['rapes'].mean(), inplace = True)
TX_df['rapesPerPop'].fillna(TX_df['rapesPerPop'].mean(), inplace = True)
TX_df['arsons'].fillna(TX_df['arsons'].mean(), inplace = True)
TX_df['arsonsPerPop'].fillna(TX_df['arsonsPerPop'].mean(), inplace = True)
TX_df['ViolentCrimesPerPop'].fillna(TX_df['ViolentCrimesPerPop'].mean(), inplace = True)
TX_df['nonViolPerPop'].fillna(TX_df['nonViolPerPop'].mean(), inplace = True)

TX_df.columns[TX_df.isna().any()]

tx_filtered = crimedata_reduced[crimedata_reduced['state'] != 'TX']
crimedata_new = pd.concat([tx_filtered, TX_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "TX"].isna().any())
Out[ ]:
Counter({False: 120, True: 2})

STATE: MA

In [ ]:
MA = crimedata_reduced[crimedata_reduced['state']=='MA']
MA_df= pd.DataFrame(MA)
MA_df.columns[MA_df.isna().any()]

len(MA_df)
MA_df['arsons'].isna().sum()
MA_df['arsonsPerPop'].isna().sum()
MA_df['ViolentCrimesPerPop'].isna().sum()
MA_df['nonViolPerPop'].isna().sum()
Out[ ]:
7
In [ ]:
MA_df['arsons'].fillna(MA_df['arsons'].mean(), inplace = True)
MA_df['arsonsPerPop'].fillna(MA_df['arsonsPerPop'].mean(), inplace = True)
MA_df['ViolentCrimesPerPop'].fillna(MA_df['ViolentCrimesPerPop'].mean(), inplace = True)
MA_df['nonViolPerPop'].fillna(MA_df['nonViolPerPop'].mean(), inplace = True)

MA_df.columns[MA_df.isna().any()]

ma_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MA']
crimedata_new = pd.concat([ma_filtered, MA_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "MA"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: OH

In [ ]:
OH = crimedata_reduced[crimedata_reduced['state']=='OH']
OH_df= pd.DataFrame(OH)
OH_df.columns[OH_df.isna().any()]

len(OH_df)
OH_df['arsons'].isna().sum()
OH_df['arsonsPerPop'].isna().sum()
OH_df['ViolentCrimesPerPop'].isna().sum()
OH_df['nonViolPerPop'].isna().sum()
Out[ ]:
3
In [ ]:
OH_df['arsons'].fillna(OH_df['arsons'].mean(), inplace = True)
OH_df['arsonsPerPop'].fillna(OH_df['arsonsPerPop'].mean(), inplace = True)
OH_df['ViolentCrimesPerPop'].fillna(OH_df['ViolentCrimesPerPop'].mean(), inplace = True)
OH_df['nonViolPerPop'].fillna(OH_df['nonViolPerPop'].mean(), inplace = True)

OH_df.columns[OH_df.isna().any()]

oh_filtered = crimedata_reduced[crimedata_reduced['state'] != 'OH']
crimedata_new = pd.concat([oh_filtered, OH_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "OH"].isna().any())
Out[ ]:
Counter({False: 122})

***For the state MI, I have run into a problem. Out of the 6 columns which include missing values, 3 of the columns are able to be imputed, as they are not missing a significant amount of values. However the other 3 are missing ALL values in the entire column. This particular state is represented by 108 rows. The columns, rapes, rapesPerPop and ViolentCrimesPerPop are missing all 108 values.

STATE: MI

In [ ]:
MI = crimedata_reduced[crimedata_reduced['state']=='MI']
MI_df= pd.DataFrame(MI)
MI_df.columns[MI_df.isna().any()]

len(MI_df)

#TOTAL rows for MI was 108
#The three columns beneath are missing ALL 108 values in these columns, there are no values to work with in order to impute the mean
MI_df['ViolentCrimesPerPop'].isna().sum()
MI_df['rapes'].isna().sum()
MI_df['rapesPerPop'].isna().sum()

#The following rows are not missing an insignificant amount of values, they can be imputed as planned
MI_df['arsons'].isna().sum()
MI_df['arsonsPerPop'].isna().sum()
MI_df['nonViolPerPop'].isna().sum()
Out[ ]:
1
In [ ]:
#Impute the columns that are appropriate to be imputed by the mean

MI_df['arsons'].fillna(MI_df['arsons'].mean(), inplace = True)
MI_df['arsonsPerPop'].fillna(MI_df['arsonsPerPop'].mean(), inplace = True)
MI_df['nonViolPerPop'].fillna(MI_df['nonViolPerPop'].mean(), inplace = True)

MI_df.columns[MI_df.isna().any()]

mi_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MI']
crimedata_new = pd.concat([mi_filtered, MI_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "MI"].isna().any())
MI_df.columns[MI_df.isna().any()]

#There are still 3 columns that have not been addressed
#These columns must be revisited
Out[ ]:
Index(['rapes', 'rapesPerPop', 'ViolentCrimesPerPop'], dtype='object')

STATE: PA

In [ ]:
PA = crimedata_reduced[crimedata_reduced['state']=='PA']
PA_df= pd.DataFrame(PA)
PA_df.columns[PA_df.isna().any()]

len(PA_df)
PA_df['arsons'].isna().sum()
PA_df['arsonsPerPop'].isna().sum()
PA_df['nonViolPerPop'].isna().sum()
Out[ ]:
1
In [ ]:
PA_df['arsons'].fillna(PA_df['arsons'].mean(), inplace = True)
PA_df['arsonsPerPop'].fillna(PA_df['arsonsPerPop'].mean(), inplace = True)
PA_df['nonViolPerPop'].fillna(PA_df['nonViolPerPop'].mean(), inplace = True)

PA_df.columns[PA_df.isna().any()]

pa_filtered = crimedata_reduced[crimedata_reduced['state'] != 'PA']
crimedata_new = pd.concat([pa_filtered, PA_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "PA"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: FL

In [ ]:
FL = crimedata_reduced[crimedata_reduced['state']=='FL']
FL_df= pd.DataFrame(FL)
FL_df.columns[FL_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: CT

In [ ]:
CT = crimedata_reduced[crimedata_reduced['state']=='CT']
CT_df= pd.DataFrame(CT)
CT_df.columns[CT_df.isna().any()]

len(CT_df)
CT_df['ViolentCrimesPerPop'].isna().sum()
Out[ ]:
2
In [ ]:
CT_df['ViolentCrimesPerPop'].fillna(CT_df['ViolentCrimesPerPop'].mean(), inplace = True)

CT_df.columns[CT_df.isna().any()]

ct_filtered = crimedata_reduced[crimedata_reduced['state'] != 'CT']
crimedata_new = pd.concat([ct_filtered, CT_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "CT"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: MN

In [ ]:
MN = crimedata_reduced[crimedata_reduced['state']=='MN']
MN_df= pd.DataFrame(MN)
MN_df.columns[CA_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: WI

In [ ]:
WI = crimedata_reduced[crimedata_reduced['state']=='WI']
WI_df= pd.DataFrame(WI)
WI_df.columns[WI_df.isna().any()]

len(WI_df)
WI_df['arsons'].isna().sum()
WI_df['arsonsPerPop'].isna().sum()
WI_df['nonViolPerPop'].isna().sum()
Out[ ]:
1
In [ ]:
WI_df['arsons'].fillna(WI_df['arsons'].mean(), inplace = True)
WI_df['arsonsPerPop'].fillna(WI_df['arsonsPerPop'].mean(), inplace = True)
WI_df['nonViolPerPop'].fillna(WI_df['nonViolPerPop'].mean(), inplace = True)

WI_df.columns[WI_df.isna().any()]

wi_filtered = crimedata_reduced[crimedata_reduced['state'] != 'WI']
crimedata_new = pd.concat([wi_filtered, WI_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "WI"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: IN

In [ ]:
IN = crimedata_reduced[crimedata_reduced['state']=='IN']
IN_df= pd.DataFrame(IN)
IN_df.columns[IN_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: NY

In [ ]:
NY = crimedata_reduced[crimedata_reduced['state']=='NY']
NY_df= pd.DataFrame(NY)
NY_df.columns[NY_df.isna().any()]

len(NY_df)
NY_df['arsons'].isna().sum()
NY_df['arsonsPerPop'].isna().sum()
NY_df['nonViolPerPop'].isna().sum()
Out[ ]:
17
In [ ]:
NY_df['arsons'].fillna(NY_df['arsons'].mean(), inplace = True)
NY_df['arsonsPerPop'].fillna(NY_df['arsonsPerPop'].mean(), inplace = True)
NY_df['nonViolPerPop'].fillna(NY_df['nonViolPerPop'].mean(), inplace = True)

NY_df.columns[NY_df.isna().any()]

ny_filtered = crimedata_reduced[crimedata_reduced['state'] != 'NY']
crimedata_new = pd.concat([ny_filtered, NY_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "NY"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: NC

In [ ]:
NC = crimedata_reduced[crimedata_reduced['state']=='NC']
NC_df= pd.DataFrame(NC)
NC_df.columns[NC_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: AL

In [ ]:
AL = crimedata_reduced[crimedata_reduced['state']=='AL']
AL_df= pd.DataFrame(AL)
AL_df.columns[AL_df.isna().any()]

len(AL_df)
AL_df['arsons'].isna().sum()
AL_df['arsonsPerPop'].isna().sum()
AL_df['nonViolPerPop'].isna().sum()

len(AL_df)
Out[ ]:
43
In [ ]:
AL_df['arsons'].fillna(AL_df['arsons'].mean(), inplace = True)
AL_df['arsonsPerPop'].fillna(AL_df['arsonsPerPop'].mean(), inplace = True)
AL_df['nonViolPerPop'].fillna(AL_df['nonViolPerPop'].mean(), inplace = True)

AL_df.columns[AL_df.isna().any()]

al_filtered = crimedata_reduced[crimedata_reduced['state'] != 'AL']
crimedata_new = pd.concat([al_filtered, AL_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "AL"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: MO

In [ ]:
MO = crimedata_reduced[crimedata_reduced['state']=='MO']
MO_df= pd.DataFrame(MO)
MO_df.columns[MO_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: WA

In [ ]:
WA = crimedata_reduced[crimedata_reduced['state']=='WA']
WA_df= pd.DataFrame(WA)
WA_df.columns[WA_df.isna().any()]

len(WA_df)
WA_df['arsons'].isna().sum()
WA_df['arsonsPerPop'].isna().sum()
WA_df['nonViolPerPop'].isna().sum()
Out[ ]:
1
In [ ]:
WA_df['arsons'].fillna(WA_df['arsons'].mean(), inplace = True)
WA_df['arsonsPerPop'].fillna(WA_df['arsonsPerPop'].mean(), inplace = True)
WA_df['nonViolPerPop'].fillna(WA_df['nonViolPerPop'].mean(), inplace = True)

WA_df.columns[WA_df.isna().any()]

wa_filtered = crimedata_reduced[crimedata_reduced['state'] != 'WA']
crimedata_new = pd.concat([wa_filtered, WA_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "WA"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: IL **MISSING VALUES 1.0: All columns which are missing values are missing 100% of its values

In [ ]:
IL = crimedata_reduced[crimedata_reduced['state']=='IL']
IL_df= pd.DataFrame(IL)
IL_df.columns[IL_df.isna().any()]

len(IL_df)
IL_df['rapes'].isna().sum()
IL_df['rapesPerPop'].isna().sum()
IL_df['ViolentCrimesPerPop'].isna().sum()
Out[ ]:
40

STATE: GA

In [ ]:
GA = crimedata_reduced[crimedata_reduced['state']=='GA']
GA_df= pd.DataFrame(GA)
GA_df.columns[GA_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: OK

In [ ]:
OK = crimedata_reduced[crimedata_reduced['state']=='OK']
OK_df= pd.DataFrame(OK)
OK_df.columns[OK_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: TN

In [ ]:
TN = crimedata_reduced[crimedata_reduced['state']=='TN']
TN_df= pd.DataFrame(TN)
TN_df.columns[TN_df.isna().any()]

len(TN_df)
TN_df['arsons'].isna().sum()
TN_df['arsonsPerPop'].isna().sum()
TN_df['nonViolPerPop'].isna().sum()
Out[ ]:
2
In [ ]:
TN_df['arsons'].fillna(TN_df['arsons'].mean(), inplace = True)
TN_df['arsonsPerPop'].fillna(TN_df['arsonsPerPop'].mean(), inplace = True)
TN_df['nonViolPerPop'].fillna(TN_df['nonViolPerPop'].mean(), inplace = True)

TN_df.columns[WA_df.isna().any()]

tn_filtered = crimedata_reduced[crimedata_reduced['state'] != 'TN']
crimedata_new = pd.concat([tn_filtered, TN_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "TN"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: VA

In [ ]:
VA = crimedata_reduced[crimedata_reduced['state']=='VA']
VA_df= pd.DataFrame(VA)
VA_df.columns[VA_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: OR

In [ ]:
OR = crimedata_reduced[crimedata_reduced['state']=='OR']
OR_df= pd.DataFrame(OR)
OR_df.columns[OR_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: SC

In [ ]:
SC = crimedata_reduced[crimedata_reduced['state']=='SC']
SC_df= pd.DataFrame(SC)
SC_df.columns[SC_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: KY

In [ ]:
KY = crimedata_reduced[crimedata_reduced['state']=='KY']
KY_df= pd.DataFrame(KY)
KY_df.columns[KY_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: RI

In [ ]:
RI = crimedata_reduced[crimedata_reduced['state']=='RI']
RI_df= pd.DataFrame(RI)
RI_df.columns[RI_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: AR

In [ ]:
AR = crimedata_reduced[crimedata_reduced['state']=='AR']
AR_df= pd.DataFrame(AR)
AR_df.columns[AR_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: CO

In [ ]:
CO = crimedata_reduced[crimedata_reduced['state']=='CO']
CO_df= pd.DataFrame(CO)
CO_df.columns[CO_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: UT

In [ ]:
UT = crimedata_reduced[crimedata_reduced['state']=='UT']
UT_df= pd.DataFrame(UT)
UT_df.columns[UT_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: LA

In [ ]:
LA = crimedata_reduced[crimedata_reduced['state']=='LA']
LA_df= pd.DataFrame(LA)
LA_df.columns[LA_df.isna().any()]

len(LA_df)
LA_df['arsons'].isna().sum()
LA_df['arsonsPerPop'].isna().sum()
LA_df['nonViolPerPop'].isna().sum()
Out[ ]:
3
In [ ]:
LA_df['arsons'].fillna(LA_df['arsons'].mean(), inplace = True)
LA_df['arsonsPerPop'].fillna(LA_df['arsonsPerPop'].mean(), inplace = True)
LA_df['nonViolPerPop'].fillna(LA_df['nonViolPerPop'].mean(), inplace = True)

LA_df.columns[LA_df.isna().any()]

la_filtered = crimedata_reduced[crimedata_reduced['state'] != 'LA']
crimedata_new = pd.concat([la_filtered, LA_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "LA"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: NH

In [ ]:
NH = crimedata_reduced[crimedata_reduced['state']=='NH']
NH_df= pd.DataFrame(NH)
NH_df.columns[NH_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: MS

In [ ]:
MS = crimedata_reduced[crimedata_reduced['state']=='MS']
MS_df= pd.DataFrame(MS)
MS_df.columns[MS_df.isna().any()]

len(MS_df)
MS_df['ViolentCrimesPerPop'].isna().sum()
Out[ ]:
1
In [ ]:
MS_df['ViolentCrimesPerPop'].fillna(MS_df['ViolentCrimesPerPop'].mean(), inplace = True)

MS_df.columns[MS_df.isna().any()]

ms_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MS']
crimedata_new = pd.concat([ms_filtered, MS_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "MS"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: AZ

In [ ]:
AZ = crimedata_reduced[crimedata_reduced['state']=='AZ']
AZ_df= pd.DataFrame(AZ)
AZ_df.columns[AZ_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: IA **MISSING VALUES 0.85

In [ ]:
IA = crimedata_reduced[crimedata_reduced['state']=='IA']
IA_df= pd.DataFrame(IA)
IA_df.columns[IA_df.isna().any()]

len(IA_df)
IA_df['arsons'].isna().sum()
IA_df['arsonsPerPop'].isna().sum()
IA_df['nonViolPerPop'].isna().sum()
Out[ ]:
17

STATE: ME

In [ ]:
ME = crimedata_reduced[crimedata_reduced['state']=='ME']
ME_df= pd.DataFrame(ME)
ME_df.columns[ME_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: WV

In [ ]:
WV = crimedata_reduced[crimedata_reduced['state']=='WV']
WV_df= pd.DataFrame(WV)
WV_df.columns[WV_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: MD

In [ ]:
MD = crimedata_reduced[crimedata_reduced['state']=='MD']
MD_df= pd.DataFrame(MD)
MD_df.columns[MD_df.isna().any()]

len(MD_df)
MD_df['arsons'].isna().sum()
MD_df['arsonsPerPop'].isna().sum()
MD_df['nonViolPerPop'].isna().sum()
Out[ ]:
3
In [ ]:
MD_df['arsons'].fillna(MD_df['arsons'].mean(), inplace = True)
MD_df['arsonsPerPop'].fillna(MD_df['arsonsPerPop'].mean(), inplace = True)
MD_df['nonViolPerPop'].fillna(MD_df['nonViolPerPop'].mean(), inplace = True)

MD_df.columns[MD_df.isna().any()]

md_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MD']
crimedata_new = pd.concat([md_filtered, MD_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "MD"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: NM

In [ ]:
NM = crimedata_reduced[crimedata_reduced['state']=='NM']
NM_df= pd.DataFrame(NM)
NM_df.columns[NM_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: SD

In [ ]:
SD = crimedata_reduced[crimedata_reduced['state']=='SD']
SD_df= pd.DataFrame(SD)
SD_df.columns[SD_df.isna().any()]

len(SD_df)
SD_df['arsons'].isna().sum()
SD_df['arsonsPerPop'].isna().sum()
SD_df['nonViolPerPop'].isna().sum()
Out[ ]:
1
In [ ]:
SD_df['arsons'].fillna(SD_df['arsons'].mean(), inplace = True)
SD_df['arsonsPerPop'].fillna(SD_df['arsonsPerPop'].mean(), inplace = True)
SD_df['nonViolPerPop'].fillna(SD_df['nonViolPerPop'].mean(), inplace = True)

SD_df.columns[SD_df.isna().any()]

sd_filtered = crimedata_reduced[crimedata_reduced['state'] != 'SD']
crimedata_new = pd.concat([sd_filtered, SD_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "SD"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: ND

In [ ]:
ND = crimedata_reduced[crimedata_reduced['state']=='ND']
ND_df= pd.DataFrame(ND)
ND_df.columns[ND_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: WY

In [ ]:
WY = crimedata_reduced[crimedata_reduced['state']=='WY']
WY_df= pd.DataFrame(WY)
WY_df.columns[WY_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: ID

In [ ]:
ID = crimedata_reduced[crimedata_reduced['state']=='ID']
ID_df= pd.DataFrame(ID)
ID_df.columns[ID_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: NY

In [ ]:
NY = crimedata_reduced[crimedata_reduced['state']=='NY']
NY_df= pd.DataFrame(NY)
NY_df.columns[NY_df.isna().any()]

len(NY_df)
NY_df['arsons'].isna().sum()
NY_df['arsonsPerPop'].isna().sum()
NY_df['nonViolPerPop'].isna().sum()
Out[ ]:
17
In [ ]:
NY_df['arsons'].fillna(NY_df['arsons'].mean(), inplace = True)
NY_df['arsonsPerPop'].fillna(NY_df['arsonsPerPop'].mean(), inplace = True)
NY_df['nonViolPerPop'].fillna(NY_df['nonViolPerPop'].mean(), inplace = True)

NY_df.columns[NY_df.isna().any()]

ny_filtered = crimedata_reduced[crimedata_reduced['state'] != 'NY']
crimedata_new = pd.concat([ny_filtered, NY_df], ignore_index=True)

Counter(crimedata_new[crimedata_new['state'] == "NY"].isna().any())
Out[ ]:
Counter({False: 122})

STATE: VT **MISSING VALUES 1.0

In [ ]:
VT = crimedata_reduced[crimedata_reduced['state']=='VT']
VT_df= pd.DataFrame(VT)
VT_df.columns[VT_df.isna().any()]

len(VT_df)
VT_df['arsons'].isna().sum()
VT_df['arsonsPerPop'].isna().sum()
VT_df['nonViolPerPop'].isna().sum()

len(VT_df)
Out[ ]:
4

STATE: AK

In [ ]:
AK = crimedata_reduced[crimedata_reduced['state']=='AK']
AK_df= pd.DataFrame(AK)
AK_df.columns[AK_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: KS *MISSING VALUES 1.0

In [ ]:
KS = crimedata_reduced[crimedata_reduced['state']=='KS']
KS_df= pd.DataFrame(KS)
KS_df.columns[KS_df.isna().any()]

len(KS_df)
KS_df['arsons'].isna().sum()
KS_df['arsonsPerPop'].isna().sum()
KS_df['nonViolPerPop'].isna().sum()

len(KS_df)
Out[ ]:
1

STATE: DE

In [ ]:
DE = crimedata_reduced[crimedata_reduced['state']=='DE']
DE_df= pd.DataFrame(DE)
DE_df.columns[DE_df.isna().any()]
Out[ ]:
Index([], dtype='object')

STATE: DC

In [ ]:
DC = crimedata_reduced[crimedata_reduced['state']=='DC']
DC_df= pd.DataFrame(DC)
DC_df.columns[DC_df.isna().any()]
Out[ ]:
Index([], dtype='object')

**There were a number of states that had a significant amount of missing values from some of its columns. These columns were: 'rapes', 'rapesPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', and 'nonViolPerPop'. The reason these were marked as significant was because they exceeded the missing value threshold of 0.65 of missing values. Some of the states are missing 100% of its values in a specific column.

Because of the nature of the amount of missing values, we are unable to impute these values with the mean. For states missing 100% of the values in a column, there is no mean to impute them with. That being said, the approach to their imputation must be handled differently. I have considered taking the 3-4 surrounding states of this particular state, aggregate them, find the mean of this particular column and then impute the mean into the missing values. This runs the risk of introducing bias into the dataset. The other option would be to drop these rows entirely, which would include dropping 6 entire states.

These states include: MI, AL, IL, IA, VT, KS

In [ ]:
#Check the total number of missing values for each column that was reported to have missing values

crimedata_new['rapes'].isna().sum()
crimedata_new['rapesPerPop'].isna().sum()
crimedata_new['arsons'].isna().sum()
crimedata_new['arsonsPerPop'].isna().sum()
crimedata_new['nonViolPerPop'].isna().sum()
Out[ ]:
80
In [ ]:
#DROP ALL ROWS WITH MISSING VALUES
crimedata_new = crimedata_new.dropna()

I have decided to drop all rows with missing values. Although we are now missing a number of entire states, the bias this may introduce (not having a fully representational sample of the United States), I believe will ultimately affect the predictive models less and introduce considerably less bias into our data as the other option of imputing the mean of surrounding areas would.

Now that we have gone through each state, imputed as necessary, and dropped rows with missing values, we will check the entire dataset for any columns with missing values that we may have missed.

In [ ]:
#Check whole dataset for columns with missing values
crimedata_new.columns[crimedata_new.isna().any()]

len(crimedata_new)
Out[ ]:
1919
In [ ]:
#Check datatypes again to confirm
crimedata_new.dtypes

Check the ranges of our target variables:

In [ ]:
print(crimedata_new['murdPerPop'].min())
print(crimedata_new['murdPerPop'].max())
0.0
91.09
In [ ]:
print(crimedata_new['robbbPerPop'].min())
print(crimedata_new['robbbPerPop'].max())
0.0
2264.13

Here we will deal with CORRELATION: I will run a correlation matrix on the entire dataset to get an idea of which attribute pairs have the highest positive and negative correlations.

In [ ]:
pd.set_option('display.max_rows', None)

#Run correlation matrix
crime_corr = crimedata_new.corr(numeric_only=True)
In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(30, 10))
sns.heatmap(crime_corr)
Out[ ]:
<AxesSubplot:>
In [ ]:
#Unstack the matrix and sort the values from lowest to highest so we can clearly see the most negatively and positively correlated pairs
sorted_mat = crime_corr.unstack().sort_values()
sorted_mat

Now we will create a dataset for each target variable, giving us 2 seperate datasets. For each dataset, we will examine the correlations between each independent variable and our dependent variable, as a step of general exploration.

We will then run a full correlation matrix on the entire target variable dataset to see which attributes are highly correlated, so we can then drop the necessary columns.

MURDER CATEGORY CORRELATIONS:

In [ ]:
#Create dataset for the specific crime target variable

murders_data = crimedata_new.drop(['rapes', 'rapesPerPop', 'robberies', 'robbbPerPop', 'assaults', 'assaultPerPop', 
                                   'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft', 'autoTheftPerPop', 
                                   'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', 'nonViolPerPop'], axis=1)
murders_data.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murders murdPerPop
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 21.44 10.93 11.33 11980 100.0 75122 89.24 1.55 70.20 23.62 1.03 18.39 79584 29711 30233 13600 5725 27101 5115.0 22838 227 1.96 5.81 9.90 48.18 2.70 64.55 14.65 28.82 5.49 50.73 3.67 26.38 5.22 4.47 3.22 91.43 90.17 95.78 95.81 44.56 58.88 31 0.36 1277 8.69 13.00 20.99 30.93 0.93 1.39 2.24 3.30 85.68 1.37 4.81 4.17 2.99 3.00 2.84 91.46 0.39 11.06 3 64 98.37 91.01 3.12 37.50 1959 0.00 0.28 215900 262600 326900 111000 685 1001 1001 316 1001 23.8 21.1 14.0 11 0 10.66 53.72 65.29 78.09 89.14 6.5 1845.9 9.63 0.0 0 0.00
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 21.30 10.48 17.18 23123 100.0 47917 78.99 1.11 64.11 35.50 2.75 22.85 55323 20148 20191 18137 0 20074 5250.0 12222 885 3.98 5.61 13.72 29.89 2.43 61.96 12.26 29.28 6.39 37.64 4.23 27.99 6.45 5.42 3.11 86.91 85.33 96.82 86.46 51.14 62.43 43 0.24 1920 5.21 8.65 13.33 22.50 0.43 0.72 1.11 1.87 87.79 1.81 4.25 3.34 2.70 2.83 1.96 89.03 1.01 23.60 3 240 97.15 84.88 0.00 18.33 1958 0.31 0.14 136300 164200 199900 63600 467 560 672 205 627 27.6 20.7 12.5 0 0 8.30 77.17 71.27 90.22 96.12 10.6 2186.7 3.84 0.0 0 0.00
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 25.88 11.01 10.28 29344 100.0 35669 82.00 1.15 55.73 22.25 2.94 14.56 42112 16946 17103 16644 21606 15528 5954.0 8405 1389 4.75 2.80 9.09 30.13 4.01 69.80 15.95 21.52 8.79 32.48 10.10 25.78 14.76 12.55 2.95 78.54 78.85 92.37 75.72 66.08 74.19 164 0.88 1468 16.42 23.98 32.08 35.63 0.82 1.20 1.61 1.78 93.11 1.14 2.97 2.05 2.42 2.69 2.06 64.18 2.03 47.46 3 544 95.68 57.79 0.92 7.54 1976 1.55 0.12 74700 90400 112000 37300 370 428 520 150 484 24.1 21.7 11.6 16 0 5.00 44.77 36.60 61.26 82.85 10.6 2780.9 4.37 0.0 3 8.30
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 32.89 20.04 13.26 140494 100.0 21577 75.78 1.00 41.15 29.31 7.12 14.09 27705 11878 12029 7382 10264 10753 7192.0 8104 23223 17.78 8.76 23.03 20.66 5.72 59.02 14.31 26.83 14.72 23.42 11.40 33.32 14.46 13.04 2.89 71.94 69.79 79.76 75.33 62.96 70.52 1511 1.58 2091 21.33 30.56 38.02 45.48 0.32 0.45 0.57 0.68 96.87 0.60 3.08 1.92 2.28 2.37 2.16 57.81 2.11 53.19 2 5119 91.81 55.50 2.09 26.22 1966 6.13 0.31 37700 53900 73100 35400 215 280 349 134 340 26.4 17.3 11.7 327 4 1.49 64.35 42.29 70.61 85.66 70.4 1995.7 0.97 0.0 7 4.63
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 27.41 12.76 14.42 28700 100.0 42805 79.47 0.39 47.70 30.23 5.41 17.23 50394 18193 18276 17342 21482 12639 21852.0 22594 1126 4.01 4.49 13.89 27.01 4.85 65.42 14.02 27.17 8.50 32.78 5.97 36.05 9.06 7.64 3.14 79.53 79.76 92.05 77.12 65.16 72.81 263 1.18 2637 11.38 16.27 23.93 27.76 1.05 1.49 2.20 2.55 89.98 0.60 5.08 3.46 2.55 2.89 2.09 64.62 1.47 47.35 3 566 95.11 56.96 1.41 34.45 1956 0.69 0.28 155100 179000 215500 60400 463 669 824 361 736 24.4 20.8 12.5 0 0 9.19 77.30 63.45 82.23 93.53 10.9 2643.5 9.62 0.0 0 0.00
In [ ]:
for col in murders_data.columns:
    print(col)
In [ ]:
plt.figure(figsize=(30, 10))
heatmap = sns.heatmap(murders_data.corr(), vmin=-1, vmax=1, annot=True)
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\312952547.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  heatmap = sns.heatmap(murders_data.corr(), vmin=-1, vmax=1, annot=True)
In [ ]:
#Examine the correlations of each column with 'murdPerPop' as our target variable
murdPerPop_corr = murders_data[murders_data.columns[1:]].corr()['murdPerPop'][:]

#Sort the values so we can see the positive and negative correlation clearly
murdPerPop_corr.sort_values()
In [ ]:
#Examine the correlations of each columns and 'murders' as our target variable
murders_corr = murders_data[murders_data.columns[1:]].corr()['murders'][:]

#Sort the values so we can see the positive and negative correlations clearly
murders_corr.sort_values()
In [ ]:
#murd_corr = murders_data.corr(method='spearman')

#Create a correlation matrix for the dataframe category
murd_corr = murders_data.corr(numeric_only=True).abs()

#Select the upper triangle of the matrix, excluding the diagnonal elements
murd_tri = murd_corr.where(np.triu(np.ones(murd_corr.shape),k=1).astype(bool))

#drop the columns with a correlation greater than 0.8 and make a list of those columns named 'murd_drop'
murd_drop = [column for column in murd_tri.columns if any(murd_tri[column] > 0.8)]

#drop the murd_drop columns from the dataframe
murders = murders_data.drop(murders_data[murd_drop], axis=1)
In [ ]:
#for col in murders.columns:
    #print(col)

len(murders.columns)
Out[ ]:
53

ROBEERIES CATEGORY CORRELATIONS:

In [ ]:
robberies_data = crimedata_new.drop(['rapes', 'rapesPerPop', 'murders', 'murdPerPop', 'assaults', 'assaultPerPop', 'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', 'nonViolPerPop'], axis=1)
robberies_data.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn robberies robbbPerPop
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 21.44 10.93 11.33 11980 100.0 75122 89.24 1.55 70.20 23.62 1.03 18.39 79584 29711 30233 13600 5725 27101 5115.0 22838 227 1.96 5.81 9.90 48.18 2.70 64.55 14.65 28.82 5.49 50.73 3.67 26.38 5.22 4.47 3.22 91.43 90.17 95.78 95.81 44.56 58.88 31 0.36 1277 8.69 13.00 20.99 30.93 0.93 1.39 2.24 3.30 85.68 1.37 4.81 4.17 2.99 3.00 2.84 91.46 0.39 11.06 3 64 98.37 91.01 3.12 37.50 1959 0.00 0.28 215900 262600 326900 111000 685 1001 1001 316 1001 23.8 21.1 14.0 11 0 10.66 53.72 65.29 78.09 89.14 6.5 1845.9 9.63 0.0 1.0 8.20
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 21.30 10.48 17.18 23123 100.0 47917 78.99 1.11 64.11 35.50 2.75 22.85 55323 20148 20191 18137 0 20074 5250.0 12222 885 3.98 5.61 13.72 29.89 2.43 61.96 12.26 29.28 6.39 37.64 4.23 27.99 6.45 5.42 3.11 86.91 85.33 96.82 86.46 51.14 62.43 43 0.24 1920 5.21 8.65 13.33 22.50 0.43 0.72 1.11 1.87 87.79 1.81 4.25 3.34 2.70 2.83 1.96 89.03 1.01 23.60 3 240 97.15 84.88 0.00 18.33 1958 0.31 0.14 136300 164200 199900 63600 467 560 672 205 627 27.6 20.7 12.5 0 0 8.30 77.17 71.27 90.22 96.12 10.6 2186.7 3.84 0.0 5.0 21.26
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 25.88 11.01 10.28 29344 100.0 35669 82.00 1.15 55.73 22.25 2.94 14.56 42112 16946 17103 16644 21606 15528 5954.0 8405 1389 4.75 2.80 9.09 30.13 4.01 69.80 15.95 21.52 8.79 32.48 10.10 25.78 14.76 12.55 2.95 78.54 78.85 92.37 75.72 66.08 74.19 164 0.88 1468 16.42 23.98 32.08 35.63 0.82 1.20 1.61 1.78 93.11 1.14 2.97 2.05 2.42 2.69 2.06 64.18 2.03 47.46 3 544 95.68 57.79 0.92 7.54 1976 1.55 0.12 74700 90400 112000 37300 370 428 520 150 484 24.1 21.7 11.6 16 0 5.00 44.77 36.60 61.26 82.85 10.6 2780.9 4.37 0.0 56.0 154.95
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 32.89 20.04 13.26 140494 100.0 21577 75.78 1.00 41.15 29.31 7.12 14.09 27705 11878 12029 7382 10264 10753 7192.0 8104 23223 17.78 8.76 23.03 20.66 5.72 59.02 14.31 26.83 14.72 23.42 11.40 33.32 14.46 13.04 2.89 71.94 69.79 79.76 75.33 62.96 70.52 1511 1.58 2091 21.33 30.56 38.02 45.48 0.32 0.45 0.57 0.68 96.87 0.60 3.08 1.92 2.28 2.37 2.16 57.81 2.11 53.19 2 5119 91.81 55.50 2.09 26.22 1966 6.13 0.31 37700 53900 73100 35400 215 280 349 134 340 26.4 17.3 11.7 327 4 1.49 64.35 42.29 70.61 85.66 70.4 1995.7 0.97 0.0 136.0 90.05
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 27.41 12.76 14.42 28700 100.0 42805 79.47 0.39 47.70 30.23 5.41 17.23 50394 18193 18276 17342 21482 12639 21852.0 22594 1126 4.01 4.49 13.89 27.01 4.85 65.42 14.02 27.17 8.50 32.78 5.97 36.05 9.06 7.64 3.14 79.53 79.76 92.05 77.12 65.16 72.81 263 1.18 2637 11.38 16.27 23.93 27.76 1.05 1.49 2.20 2.55 89.98 0.60 5.08 3.46 2.55 2.89 2.09 64.62 1.47 47.35 3 566 95.11 56.96 1.41 34.45 1956 0.69 0.28 155100 179000 215500 60400 463 669 824 361 736 24.4 20.8 12.5 0 0 9.19 77.30 63.45 82.23 93.53 10.9 2643.5 9.62 0.0 9.0 30.44
In [ ]:
robberies_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1919 entries, 0 to 2214
Columns: 106 entries, communityname to robbbPerPop
dtypes: float64(77), int64(27), object(2)
memory usage: 1.6+ MB
In [ ]:
plt.figure(figsize=(30, 10))
sns.heatmap(robberies_data.corr(), vmin=-1, vmax=1, annot=True)
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\1798950277.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(robberies_data.corr(), vmin=-1, vmax=1, annot=True)
Out[ ]:
<AxesSubplot:>
In [ ]:
#Examine the correlations of each columns and 'robbbPerPop' as our target variable
robbbPerPop_corr = robberies_data[robberies_data.columns[1:]].corr()['robbbPerPop'][:]

#Sort the values so we can see the positive and negative correlations clearly
robbbPerPop_corr.sort_values()
In [ ]:
#Examine the correlations of each columns and 'robberies' as our target variable
robberies_corr = robberies_data[robberies_data.columns[1:]].corr()['robberies'][:]

#Sort the values so we can see the positive and negative correlations clearly
robberies_corr.sort_values()
In [ ]:
#Create a correlation matrix for the dataframe category
robb_corr = robberies_data.corr(numeric_only=True).abs()

#Select the upper triangle of the matrix, excluding the diagnonal elements
robb_tri = robb_corr.where(np.triu(np.ones(robb_corr.shape),k=1).astype(bool))

#drop the columns with a correlation greater than 0.8 and make a list of those columns
robb_drop = [column for column in robb_tri.columns if any(robb_tri[column] > 0.8)]

#drop the columns in the previous list from the dataframe
robberies = robberies_data.drop(robberies_data[robb_drop], axis=1)

robberies.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn robbbPerPop
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 8.20
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 21.26
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 154.95
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 90.05
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 30.44
In [ ]:
robberies[robberies.columns[1:]].corr()['robbbPerPop'][:]
In [ ]:
murders_copy[murders_copy.columns[1:]].corr()['murdPerPop'][:]

Here I have chosen to make a copy of the original murders dataset. I did not make a copy before altering the original dataset, so here I have backtracked in order to do this.

In [ ]:
#Create a correlation matrix for the dataframe category
murd_corr2 = murders_data.corr(numeric_only=True).abs()

#Select the upper triangle of the matrix, excluding the diagnonal elements
murd_tri2 = murd_corr2.where(np.triu(np.ones(murd_corr.shape),k=1).astype(bool))

#drop the columns with a correlation greater than 0.8 and make a list of those columns named 'murd_drop'
murd_drop2 = [column for column in murd_tri2.columns if any(murd_tri2[column] > 0.8)]

#drop the murd_drop columns from the dataframe
murders_copy = murders_data.drop(murders_data[murd_drop2], axis=1)

murders_copy.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00

We will now go through the remaining attributes to review the column summaries, as well as the distributions in order to see how balanced they are. If they are normally or close to normally distributed, we will normalize the column with a range of 0-1. If they are not normally distributed and clearly skewed/unbalanced, the column will be turned to a categorical variable with different levels. We will turn our target variables into a CATEGORICAL ATTRIBUTE. The quartiles within the column summary will establish the levels that are chosen. Lastly, we will run one more method of feature selection after normalization, INFORMATION GAIN. We will have to decide on either ENTROPY or GINI. Then we will drop any further columns if needed. This will serve as preparation of our dataset for the regression and classification models.

***RUN INFORMATION GAIN (ENTROPY OR GINI) I have attempted to run information gain on top of feature selection via correlation. I was unable to assess the issue effectively and will not be moving forward without it for now. I may revisit this at a later time.

In [ ]:
#Examine the summaries of each attribute 
murders.describe()
Out[ ]:
population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop
count 1.919000e+03 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000 1919.000000
mean 5.222536e+04 2.710662 9.353835 83.515268 2.804054 8.673700 14.404299 12.009489 69.758046 33901.321001 0.884648 43.464356 6.773955 16.119192 11573.294424 12301.949453 14313.012507 9451.968213 11022.774883 11.685935 9.435503 22.991496 17.795654 24.504070 13.709792 9.176597 30.687061 73.977910 60.309349 13.662590 1.168708 65.509693 45.854846 2.623241 92.690594 2.780214 35.128588 1962.744659 0.433111 201.729026 26.362064 21.300834 13.033299 60.188791 51.336967 87.711407 28.031162 2815.384575 3.093622 1.007004 5.927712
std 2.051729e+05 0.347113 13.914794 16.325927 4.720112 15.388139 4.479374 4.847315 44.375875 13478.160054 0.688882 12.757634 4.474456 4.588814 9336.714385 15477.135843 9635.116167 8046.208476 5757.066299 8.451142 6.864861 12.521400 8.086259 6.650632 6.399682 2.806209 8.027055 10.302549 7.904721 9.680139 1.663076 14.131028 14.076144 0.519988 5.265741 3.447570 14.026619 11.160290 0.431390 87.361065 2.907053 2.948489 1.466011 17.074383 10.718396 7.488716 116.924373 2949.803130 4.957666 2.927262 9.038823
min 1.000500e+04 1.600000 0.000000 2.680000 0.060000 0.120000 4.580000 1.660000 0.000000 12908.000000 0.000000 9.020000 0.500000 3.460000 0.000000 0.000000 0.000000 0.000000 0.000000 0.640000 0.200000 1.630000 2.050000 8.690000 1.370000 2.130000 12.060000 32.240000 24.420000 0.000000 0.000000 13.930000 3.060000 1.000000 37.470000 0.000000 3.120000 1939.000000 0.000000 0.000000 14.900000 14.100000 10.100000 6.750000 11.830000 32.830000 0.900000 10.000000 0.000000 0.000000 0.000000
25% 1.430700e+04 2.490000 0.930000 75.885000 0.625000 0.950000 12.210000 8.875000 0.000000 23689.500000 0.470000 34.285000 3.370000 13.045000 6749.500000 6346.500000 8537.500000 5584.000000 7299.500000 4.650000 4.760000 14.080000 11.985000 20.080000 9.035000 7.130000 25.475000 67.710000 55.290000 6.965000 0.190000 56.435000 37.745000 2.000000 90.950000 0.770000 24.690000 1956.000000 0.170000 140.000000 24.400000 19.200000 11.900000 48.365000 44.635000 84.725000 7.200000 1177.650000 0.370000 0.000000 0.000000
50% 2.258000e+04 2.660000 3.070000 89.610000 1.260000 2.430000 13.620000 11.860000 100.000000 31231.000000 0.690000 42.440000 5.660000 15.740000 9787.000000 9874.000000 12408.000000 8182.000000 9703.000000 9.500000 7.900000 19.650000 16.670000 23.360000 13.020000 9.220000 29.080000 74.780000 60.610000 12.420000 0.540000 64.960000 46.790000 3.000000 93.990000 1.730000 34.470000 1964.000000 0.330000 175.000000 26.200000 21.400000 12.800000 62.080000 51.890000 89.620000 13.600000 2007.600000 1.240000 0.000000 2.420000
75% 4.308450e+04 2.855000 11.390000 95.965000 2.815000 8.910000 15.390000 14.545000 100.000000 41464.500000 1.100000 52.270000 9.100000 18.845000 14519.000000 14790.500000 17413.500000 11407.500000 13424.000000 17.040000 12.145000 28.910000 22.730000 27.605000 17.390000 11.130000 33.470000 81.775000 65.585000 18.035000 1.410000 75.390000 54.210000 3.000000 95.920000 3.510000 44.275000 1971.000000 0.560000 242.500000 28.100000 23.300000 13.800000 74.170000 58.650000 92.730000 25.350000 3284.800000 3.450000 0.000000 8.610000
max 7.322564e+06 5.280000 96.670000 99.630000 57.460000 95.290000 54.400000 52.770000 100.000000 123625.000000 6.530000 89.040000 26.920000 45.510000 212120.000000 480000.000000 106165.000000 137000.000000 54648.000000 48.820000 49.890000 73.630000 50.030000 62.670000 44.270000 19.090000 76.320000 93.600000 87.970000 64.290000 13.710000 96.590000 95.340000 4.000000 99.000000 39.890000 82.130000 1987.000000 5.330000 803.000000 35.100000 32.700000 23.400000 93.140000 78.560000 99.900000 3569.800000 44229.900000 54.330000 48.440000 91.090000
In [ ]:
#Check the skew values of all attributes, sort them in ascending order
murders.skew(numeric_only=True).sort_values()
In [ ]:
#Find square root to establish how many bins are appropriate
import math
len(murders)
math.sqrt(2215)

#Plot the distribution
import matplotlib.pyplot as plt
murders['population'].hist(bins=20)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['population'].min())
print(murders['population'].max())

#Determine bins based on quantiles
pop_bins = pd.qcut(murders['population'], q=5)
#Check the value counts of each bin to ensure they are balanced
pop_bins.value_counts()

#Create bin labels
pop_bin_labels = ['10000-13500', '13500-19000', '19000-29000', '29000-51500', '515000-7500000']
#Create bins
pop_bin = [10000, 13500, 19000, 29000, 51500, 7500000] 
#Add new category
murders['pop_bins'] = pd.cut(murders['population'], bins=pop_bin, labels=pop_bin_labels)

murders.head()
10005
7322564
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000
In [ ]:
murders['pop_bins'].value_counts()
Out[ ]:
10000-13500       410
13500-19000       387
515000-7500000    384
19000-29000       382
29000-51500       356
Name: pop_bins, dtype: int64
In [ ]:
murders['householdsize'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()

#SHAPIRO TEST HERE
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['householdsize_bins'] = (murders['householdsize'] - murders['householdsize'].min()) / (murders['householdsize'].max() - murders['householdsize'].min())    
  
murders.head()
In [ ]:
murders['racepctblack'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['racepctblack'].min())
print(murders['racepctblack'].max())

#Determine bins based on quantiles
racepctblack_bins = pd.qcut(murders['racepctblack'], q=4)
#Check the value counts of each bin to ensure they are balanced
racepctblack_bins.value_counts()

#Create bin labels
racepctblack_bin_labels = ['0-0.8%', '0.9-2.8%', '2.9-11.1%', '11.2-97%']

#Create new bin category
murders['racepctblack_bins'] = pd.qcut(murders['racepctblack'],
                              q=4,
                              labels=racepctblack_bin_labels)

murders.head()
0.0
96.67
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8%
In [ ]:
murders['racepctblack_bins'].value_counts()
Out[ ]:
0-0.8%       484
11.2-97%     480
2.9-11.1%    479
0.9-2.8%     476
Name: racepctblack_bins, dtype: int64
In [ ]:
murders['racePctWhite'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['racePctWhite'].min())
print(murders['racePctWhite'].max())

#Determine bins based on quantiles
racePctWhite_= pd.qcut(murders['racePctWhite'], q=4)
#Check the value counts of each bin to ensure they are balanced
racePctWhite_.value_counts()

#Create bin labels
racePctWhite_labels = ['0.0-75%','75-90%', '90-96%', '96-100%']
#Create bins
racePctWhite_bin = [0, 75, 90, 96, 100]
#Add new category
#murders_backup = murders
murders['racePctWhite_bins'] = pd.cut(murders['racePctWhite'], bins=racePctWhite_bin, labels=racePctWhite_labels)

#murders.head()
2.68
99.63
In [ ]:
murders['racePctWhite_bins'].value_counts()
Out[ ]:
75-90%     518
96-100%    474
90-96%     467
0.0-75%    460
Name: racePctWhite_bins, dtype: int64
In [ ]:
murders['racePctAsian'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['racePctAsian'].min())
print(murders['racePctAsian'].max())


#Determine bins based on quantiles
racePctAsian_bins = pd.qcut(murders['racePctAsian'], q=4)
#Check the value counts of each bin to ensure they are balanced
racePctAsian_bins.value_counts()

#Create bin labels
racePctAsian_bin_labels = ['0.0-0.6%', '0.6-1.2%', '1.2-2.6%', '2.7-57.5%']

#Create new bin category
murders_backup = murders
murders['racePctAsian_bins'] = pd.qcut(murders['racePctAsian'],
                              q=4,
                              labels=racePctAsian_bin_labels)
#murders.head()
0.06
57.46
In [ ]:
murders['racePctAsian_bins'].value_counts()
Out[ ]:
0.0-0.6%     480
0.6-1.2%     480
2.7-57.5%    480
1.2-2.6%     479
Name: racePctAsian_bins, dtype: int64
In [ ]:
murders[' racePctHisp'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders[' racePctHisp'].min())
print(murders[' racePctHisp'].max())

#Determine bins based on quantiles
racePctHisp_bins = pd.qcut(murders[' racePctHisp'], q=4)
#Check the value counts of each bin to ensure they are balanced
racePctHisp_bins.value_counts()

#Create bin labels
racePctHisp_bin_labels = ['0.1-0.9%', '0.9-2.2%', '2.2-7.8%', '7.8-95.3%']

#Create new bin category
murders['racePctHisp_bins'] = pd.qcut(murders[' racePctHisp'],
                              q=4,
                              labels=racePctHisp_bin_labels)
murders.head()
In [ ]:
murders['racePctHisp_bins'].value_counts()
Out[ ]:
0.1-0.9%     485
2.2-7.8%     480
7.8-95.3%    479
0.9-2.2%     475
Name: racePctHisp_bins, dtype: int64
In [ ]:
murders['agePct12t21'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['agePct12t21_norm'] = (murders['agePct12t21'] - murders['agePct12t21'].min()) / (murders['agePct12t21'].max() - murders['agePct12t21'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_class_target murdPerPop_reg_target householdsize_norm agePct12t21_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 0.407609 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% No 0.000000 0.407609 0.158370
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 0.331522 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.331522 0.129065
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 0.225543 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Yes 0.091119 0.225543 0.136090
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 0.230978 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Yes 0.050829 0.230978 0.271176
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 0.271739 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.271739 0.132276
In [ ]:
murders = murders.drop(['householdsize_bins', 'agePct12t21_bins'], axis=1)
In [ ]:
murders[' agePct65up'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['agePct65up_norm'] = (murders[' agePct65up'] - murders[' agePct65up'].min()) / (murders[' agePct65up'].max() - murders[' agePct65up'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_class_target murdPerPop_reg_target householdsize_norm agePct12t21_norm agePct65up_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% No 0.000000 0.407609 0.158370 0.189200
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.331522 0.129065 0.303659
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Yes 0.091119 0.225543 0.136090 0.168656
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Yes 0.050829 0.230978 0.271176 0.226961
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.271739 0.132276 0.249658
In [ ]:
murders = murders.drop(['agePct65up_bins'], axis=1)
In [ ]:
murders[' pctUrban'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders[' pctUrban'].min())
print(murders[' pctUrban'].max())

#Determine bins based on quantiles
print(murders[' pctUrban'].describe())

#Create bin labels
pctUrban_bin_labels = ['0.0%', '0.1-99%', '100%']
#Create bins
pctUrban_bin = [-1, 0.1, 99, 100]
#Add new category
murders_backup = murders
murders['pctUrban_bins'] = pd.cut(murders[' pctUrban'], bins=pctUrban_bin, labels=pctUrban_bin_labels)
murders.head()
0.0
100.0
count    1919.000000
mean       69.758046
std        44.375875
min         0.000000
25%         0.000000
50%       100.000000
75%       100.000000
max       100.000000
Name:  pctUrban, dtype: float64
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100%
In [ ]:
murders['pctUrban_bins'].value_counts()

#This distribution was not able to be balanced in any meaningful way
#I decided to leave the values as is, changing them to a binary yes/no would have been unbalanced and would have lost more information
Out[ ]:
100%       1158
0.0%        532
0.1-99%     229
Name: pctUrban_bins, dtype: int64
In [ ]:
murders[' medIncome'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders[' medIncome'].min())
print(murders[' medIncome'].max())

#Determine bins based on quantiles
medIncome_ = pd.qcut(murders[' medIncome'], q=4)
#Check the value counts of each bin to ensure they are balanced
medIncome_.value_counts()

#Create bin labels
medIncome_labels = ['$8,000-$24,000', '$24,000-$32,000', '$32,000-$42,000', '42,000-$125,000']
#Create bins
medIncome_bin = [8865, 24000, 32000, 42000, 123625]
#Add new category
murders_backup = murders
murders['medIncome_bins'] = pd.cut(murders[' medIncome'], bins=medIncome_bin, labels=medIncome_labels)
murders.head()
12908
123625
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000
In [ ]:
murders['medIncome_bins'].value_counts()
Out[ ]:
$24,000-$32,000    511
$8,000-$24,000     500
42,000-$125,000    458
$32,000-$42,000    450
Name: medIncome_bins, dtype: int64
In [ ]:
murders['pctWFarmSelf'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['pctWFarmSelf'].min())
print(murders['pctWFarmSelf'].max())

#Determine bins based on quantiles
murders['pctWFarmSelf'].describe()

#Create bin labels
pctWFarmSelf_labels = ['0-0.5%', '0.5-0.7%', '0.7-1.0%', '1.0-7.0%']
#Create bins
pctWFarmSelf_bin = [-1, 0.5, 0.7, 1, 7]
#Add new category
murders['pctWFarmSelf_bins'] = pd.cut(murders['pctWFarmSelf'], bins=pctWFarmSelf_bin, labels=pctWFarmSelf_labels)
murders.head()
0.0
6.53
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5%
In [ ]:
murders['pctWFarmSelf_bins'].value_counts()
Out[ ]:
0-0.5%      578
1.0-7.0%    553
0.5-0.7%    401
0.7-1.0%    387
Name: pctWFarmSelf_bins, dtype: int64
In [ ]:
murders['pctWInvInc'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['pctWInvInc_norm'] = (murders['pctWInvInc'] - murders['pctWInvInc'].min()) / (murders['pctWInvInc'].max() - murders['pctWInvInc'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379
In [ ]:
murders[' pctWPubAsst'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders[' pctWPubAsst'].min())
print(murders[' pctWPubAsst'].max())

#Determine bins based on quantiles
pctWPubAsst_ = pd.qcut(murders[' pctWPubAsst'], q=4)
#Check the value counts of each bin to ensure they are balanced
pctWPubAsst_.value_counts()

#Create bin labels
pctWPubAsst_labels = ['0-3%', '3-5%', '5-8%', '8-45%']
#Create bins
pctWPubAsst_bin = [0, 3, 5, 8.5, 45]
#Add new category
murders['pctWPubAsst_bins'] = pd.cut(murders[' pctWPubAsst'], bins=pctWPubAsst_bin, labels=pctWPubAsst_labels)
murders.head()
0.5
26.92
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8%
In [ ]:
murders['pctWPubAsst_bins'].value_counts()
Out[ ]:
8-45%    552
5-8%     532
3-5%     441
0-3%     394
Name: pctWPubAsst_bins, dtype: int64
In [ ]:
murders['pctWRetire'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['pctWRetire_norm'] = (murders['pctWRetire'] - murders['pctWRetire'].min()) / (murders['pctWRetire'].max() - murders['pctWRetire'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467
In [ ]:
murders['blackPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['blackPerCap'].min())
print(murders['blackPerCap'].max())

#Determine bins based on quantiles
blackPerCap_ = pd.qcut(murders['blackPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
blackPerCap_.value_counts()

#Create bin labels
blackPerCap_labels = ['0-7000', '6500-10000', '10000-15000', '15000-250000']
#Create bins
blackPerCap_bin = [-1, 7000, 10000, 15000, 250000]
#Add new catrgory
murders['blackPerCap_bins'] = pd.cut(murders['blackPerCap'], bins=blackPerCap_bin, labels=blackPerCap_labels)
murders.head()
0
212120
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000
In [ ]:
murders['blackPerCap_bins'].value_counts()
Out[ ]:
0-7000          536
10000-15000     498
6500-10000      449
15000-250000    436
Name: blackPerCap_bins, dtype: int64
In [ ]:
murders['indianPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['indianPerCap'].min())
print(murders['indianPerCap'].max())

#Determine bins based on quantiles
indianPerCap_ = pd.qcut(murders['indianPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
indianPerCap_.value_counts()

#Create bin labels
indianPerCap_labels = ['0-6500', '6500-10000', '10000-15000', '15000-500000']
#Create bins
indianPerCap_bin = [-1, 6500, 10000, 15000, 500000]
#Add new category
murders['indianPerCap_bins'] = pd.cut(murders['indianPerCap'], bins=indianPerCap_bin, labels=indianPerCap_labels)
murders.head()
0
480000
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000
In [ ]:
murders['indianPerCap_bins'].value_counts()
Out[ ]:
0-6500          497
10000-15000     481
6500-10000      474
15000-500000    467
Name: indianPerCap_bins, dtype: int64
In [ ]:
murders['AsianPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['AsianPerCap'].min())
print(murders['AsianPerCap'].max())

#Determine bins based on quantiles
AsianPerCap_ = pd.qcut(murders['AsianPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
AsianPerCap_.value_counts()

#Create bin labels
AsianPerCap_labels = ['0-8500', '8500-12500', '12500-17500', '17500-106500']
#Create bins
AsianPerCap_bin = [-1, 8500, 12500, 17500, 106500]
#Add new category
murders['AsianPerCap_bins'] = pd.cut(murders['AsianPerCap'], bins=AsianPerCap_bin, labels=AsianPerCap_labels)
murders.head()
0
106165
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500
In [ ]:
murders['AsianPerCap_bins'].value_counts()
Out[ ]:
8500-12500      501
17500-106500    477
0-8500          476
12500-17500     465
Name: AsianPerCap_bins, dtype: int64
In [ ]:
murders['OtherPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['OtherPerCap'].min())
print(murders['OtherPerCap'].max())

#Determine bins based on quantiles
OtherPerCap_ = pd.qcut(murders['OtherPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
OtherPerCap_.value_counts()

#Create bin labels
OtherPerCap_labels = ['0-5500', '5500-8000', '8000-11500', '11500-137000']
#Create bins
OtherPerCap_bin = [-1, 5500, 8000, 11500, 137000]
#Add new category
murders['OtherPerCap_bins'] = pd.cut(murders['OtherPerCap'], bins=OtherPerCap_bin, labels=OtherPerCap_labels)
murders.head()
0.0
137000.0
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000
In [ ]:
murders['OtherPerCap_bins'].value_counts()
Out[ ]:
8000-11500      531
11500-137000    472
0-5500          469
5500-8000       447
Name: OtherPerCap_bins, dtype: int64
In [ ]:
murders['HispPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['HispPerCap'].min())
print(murders['HispPerCap'].max())

#Determine bins based on quantiles
HispPerCap_ = pd.qcut(murders['HispPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
HispPerCap_.value_counts()

#Create bin labels
HispPerCap_labels = ['0-7500', '7500-1000', '1000-13500', '13500-55000']
#Create bins
HispPerCap_bin = [-1, 7500, 10000, 13500, 55000]
#Add new category
murders['HispPerCap_bins'] = pd.cut(murders['HispPerCap'], bins=HispPerCap_bin, labels=HispPerCap_labels)
murders.head()
0
54648
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000
In [ ]:
murders['HispPerCap_bins'].value_counts()
Out[ ]:
0-7500         522
7500-1000      487
13500-55000    469
1000-13500     441
Name: HispPerCap_bins, dtype: int64
In [ ]:
murders['PctPopUnderPov'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctPopUnderPov'].min())
print(murders['PctPopUnderPov'].max())

#Determine bins based on quantiles
PctPopUnderPov_ = pd.qcut(murders['PctPopUnderPov'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctPopUnderPov_.value_counts()

#Create bin labels
PctPopUnderPov_labels = ['0-5%', '5-10%', '10-17%', '17-60%']
#Create bins
PctPopUnderPov_bin = [0, 5, 10, 17, 60]
#Add new category
murders['PctPopUnderPov_bins'] = pd.cut(murders['PctPopUnderPov'], bins=PctPopUnderPov_bin, labels=PctPopUnderPov_labels)
murders.head()
0.64
48.82
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5%
In [ ]:
murders['PctPopUnderPov_bins'].value_counts()
Out[ ]:
0-5%      522
17-60%    483
5-10%     477
10-17%    437
Name: PctPopUnderPov_bins, dtype: int64
In [ ]:
murders['PctLess9thGrade'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctLess9thGrade'].min())
print(murders['PctLess9thGrade'].max())

#Determine bins based on quantiles
PctLess9thGrade_ = pd.qcut(murders['PctLess9thGrade'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctLess9thGrade_.value_counts()

#Create bin labels
PctLess9thGrade_labels = ['0-5%', '5-8%', '8-12%', '12-50%']
#Create bins
PctLess9thGrade_bin = [0, 5, 8, 12, 50]
#Add new category
murders['PctLess9thGrade_bins'] = pd.cut(murders['PctLess9thGrade'], bins=PctLess9thGrade_bin, labels=PctLess9thGrade_labels)
murders.head()
0.2
49.89
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5%
In [ ]:
murders['PctLess9thGrade_bins'].value_counts()
Out[ ]:
0-5%      516
12-50%    488
5-8%      462
8-12%     453
Name: PctLess9thGrade_bins, dtype: int64
In [ ]:
murders['PctBSorMore'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctBSorMore'].min())
print(murders['PctBSorMore'].max())

#Determine bins based on quantiles
PctBSorMore_ = pd.qcut(murders['PctBSorMore'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctBSorMore_.value_counts()

#Create bin labels
PctBSorMore_labels = ['0-14', '14-19', '19-30', '30-80']
#Create bins
PctBSorMore_bin = [0, 14, 19, 30, 80]
#Add new category
murders['PctBSorMore_bins'] = pd.cut(murders['PctBSorMore'], bins=PctBSorMore_bin, labels=PctBSorMore_labels)
murders.head()
1.63
73.63
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30
In [ ]:
murders['PctBSorMore_bins'].value_counts()
Out[ ]:
19-30    565
0-14     473
14-19    443
30-80    438
Name: PctBSorMore_bins, dtype: int64
In [ ]:
murders['PctEmplManu'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctEmplManu_norm'] = (murders['PctEmplManu'] - murders['PctEmplManu'].min()) / (murders['PctEmplManu'].max() - murders['PctEmplManu'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479
In [ ]:
murders['PctEmplProfServ'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctEmplProfServ_norm'] = (murders['PctEmplProfServ'] - murders['PctEmplProfServ'].min()) / (murders['PctEmplProfServ'].max() - murders['PctEmplProfServ'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_class_target murdPerPop_reg_target householdsize_norm agePct12t21_norm agePct65up_norm PctEmplProfServ_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% No 0.000000 0.407609 0.158370 0.189200 0.372916
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.331522 0.129065 0.303659 0.381438
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Yes 0.091119 0.225543 0.136090 0.168656 0.237681
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Yes 0.050829 0.230978 0.271176 0.226961 0.336050
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.271739 0.132276 0.249658 0.342349
In [ ]:
murders = murders.drop(['PctEmplProfServ_bins'], axis=1)
In [ ]:
murders['PctOccupManu'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctOccupManu_norm'] = (murders['PctOccupManu'] - murders['PctOccupManu'].min()) / (murders['PctOccupManu'].max() - murders['PctOccupManu'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200
In [ ]:
murders['MalePctDivorce'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['MalePctDivorce_norm'] = (murders['MalePctDivorce'] - murders['MalePctDivorce'].min()) / (murders['MalePctDivorce'].max() - murders['MalePctDivorce'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415
In [ ]:
murders['MalePctNevMarr'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['MalePctNevMarr_norm'] = (murders['MalePctNevMarr'] - murders['MalePctNevMarr'].min()) / (murders['MalePctNevMarr'].max() - murders['MalePctNevMarr'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_class_target murdPerPop_reg_target householdsize_norm agePct12t21_norm agePct65up_norm PctEmplProfServ_norm MalePctNevMarr_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% No 0.000000 0.407609 0.158370 0.189200 0.372916 0.222845
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.331522 0.129065 0.303659 0.381438 0.247899
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Yes 0.091119 0.225543 0.136090 0.168656 0.237681 0.213508
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Yes 0.050829 0.230978 0.271176 0.226961 0.336050 0.330843
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.271739 0.132276 0.249658 0.342349 0.373327
In [ ]:
murders = murders.drop(['MalePctNevMarr_bins'], axis=1)
In [ ]:
murders['PctFam2Par'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctFam2Par'].min())
print(murders['PctFam2Par'].max())

#Determine bins based on quantiles
PctFam2Par_ = pd.qcut(murders['PctFam2Par'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctFam2Par_.value_counts()

#Create bin labels
PctFam2Par_labels = ['0-67%', '67-70%', '75-82%', '82-95%']
#Create bins
PctFam2Par_bin = [0, 67, 75, 82, 95]
#Add new category
murders['PctFam2Par_bins'] = pd.cut(murders['PctFam2Par'], bins=PctFam2Par_bin, labels=PctFam2Par_labels)
murders.head()
32.24
93.6
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82%
In [ ]:
murders['PctFam2Par_bins'].value_counts()
Out[ ]:
67-70%    533
75-82%    482
82-95%    462
0-67%     442
Name: PctFam2Par_bins, dtype: int64
In [ ]:
murders['PctWorkMomYoungKids'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctWorkMomYoungKids_norm'] = (murders['PctWorkMomYoungKids'] - murders['PctWorkMomYoungKids'].min()) / (murders['PctWorkMomYoungKids'].max() - murders['PctWorkMomYoungKids'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070
In [ ]:
murders['PctImmigRecent'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctImmigRecent'].min())
print(murders['PctImmigRecent'].max())

#Determine bins based on quantiles
PctImmigRecent_ = pd.qcut(murders['PctImmigRecent'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctImmigRecent_.value_counts()

#Create bin labels
PctImmigRecent_labels = ['0-7%', '7-12%', '12-18%', '18-65%']
#Create bins
PctImmigRecent_bin = [-1, 7, 12, 18, 65]
#Add new category
murders['PctImmigRecent_bins'] = pd.cut(murders['PctImmigRecent'], bins=PctImmigRecent_bin, labels=PctImmigRecent_labels)
murders.head()
0.0
64.29
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12%
In [ ]:
murders['PctImmigRecent_bins'].value_counts()
Out[ ]:
12-18%    515
18-65%    486
0-7%      482
7-12%     436
Name: PctImmigRecent_bins, dtype: int64
In [ ]:
murders['PctRecentImmig'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctRecentImmig'].min())
print(murders['PctRecentImmig'].max())

#Determine bins based on quantiles
PctRecentImmig_ = pd.qcut(murders['PctRecentImmig'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctRecentImmig_.value_counts()

#Create bin labels
PctRecentImmig_labels = ['0-0.2%', '0.2-0.5%', '0.5-1.3%', '1.3-13.7%']
#Create bins
PctRecentImmig_bin = [-1, 0.2, 0.5, 1.3, 13.71]
#Add new category
murders['PctRecentImmig_bins'] = pd.cut(murders['PctRecentImmig'], bins=PctRecentImmig_bin, labels=PctRecentImmig_labels)
murders.head()
0.0
13.71
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3%
In [ ]:
murders['PctRecentImmig_bins'].value_counts()
Out[ ]:
0-0.2%       526
1.3-13.7%    516
0.5-1.3%     474
0.2-0.5%     403
Name: PctRecentImmig_bins, dtype: int64
In [ ]:
murders['PctPersOwnOccup'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctPersOwnOccup_norm'] = (murders['PctPersOwnOccup'] - murders['PctPersOwnOccup'].min()) / (murders['PctPersOwnOccup'].max() - murders['PctPersOwnOccup'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235
In [ ]:
murders['PctHousLess3BR'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctHousLess3BR_norm'] = (murders['PctHousLess3BR'] - murders['PctHousLess3BR'].min()) / (murders['PctHousLess3BR'].max() - murders['PctHousLess3BR'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952
In [ ]:
murders['MedNumBR'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['MedNumBR'].min())
print(murders['MedNumBR'].max())

#Determine bins based on quantiles
murders['MedNumBR'].describe()

#Create bin labels
MedNumBR_labels = ['0-2.5', '2.5-4']
#Create bins
MedNumBR_bin = [0, 2.5, 4]
#Add new category
murders['MedNumBR_bins'] = pd.cut(murders['MedNumBR'], bins=MedNumBR_bin, labels=MedNumBR_labels)
murders.head()
1
4
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4
In [ ]:
murders['MedNumBR_bins'].value_counts()
Out[ ]:
2.5-4    1176
0-2.5     743
Name: MedNumBR_bins, dtype: int64
In [ ]:
murders['PctHousOccup'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctHousOccup'].min())
print(murders['PctHousOccup'].max())

#Determine bins based on quantiles
PctHousOccup_ = pd.qcut(murders['PctHousOccup'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctHousOccup_.value_counts()

#Create bin labels
PctHousOccup_labels = ['30-91%', '91-94%', '94-96%', '96-99%']
#Create bins
PctHousOccup_bin = [30, 91, 94, 96, 99]
#Add new category
murders['PctHousOccup_bins'] = pd.cut(murders['PctHousOccup'], bins=PctHousOccup_bin, labels=PctHousOccup_labels)
murders.head()
37.47
99.0
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96%
In [ ]:
murders['PctHousOccup_bins'].value_counts()
Out[ ]:
94-96%    495
30-91%    486
91-94%    475
96-99%    463
Name: PctHousOccup_bins, dtype: int64
In [ ]:
murders['PctVacantBoarded'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctVacantBoarded'].min())
print(murders['PctVacantBoarded'].max())

#Determine bins based on quantiles
PctVacantBoarded_ = pd.qcut(murders['PctVacantBoarded'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctVacantBoarded_.value_counts()

#Create bin labels
PctVacantBoarded_labels = ['0-0.75%', '0.75-1.75%', '1.75-4%', '4-40%']
#Create bins
PctVacantBoarded_bin = [-1, 0.75, 1.75, 4, 40]
#Add new category
murders['PctVacantBoarded_bins'] = pd.cut(murders['PctVacantBoarded'], bins=PctVacantBoarded_bin, labels=PctVacantBoarded_labels)
murders.head()
0.0
39.89
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75%
In [ ]:
murders['PctVacantBoarded_bins'].value_counts()
Out[ ]:
1.75-4%       540
0.75-1.75%    500
0-0.75%       473
4-40%         406
Name: PctVacantBoarded_bins, dtype: int64
In [ ]:
murders['PctVacMore6Mos'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctVacMore6Mos_norm'] = (murders['PctVacMore6Mos'] - murders['PctVacMore6Mos'].min()) / (murders['PctVacMore6Mos'].max() - murders['PctVacMore6Mos'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532
In [ ]:
murders['MedYrHousBuilt'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['MedYrHousBuilt'].min())
print(murders['MedYrHousBuilt'].max())

#Determine bins based on quantiles
MedYrHousBuilt_ = pd.qcut(murders['MedYrHousBuilt'], q=4)
#Check the value counts of each bin to ensure they are balanced
MedYrHousBuilt_.value_counts()

#Create bin labels
MedYrHousBuilt_labels = ['1939-1956', '1956-1964', '1964-1971', '1971-1987']
#Create bins
MedYrHousBuilt_bin = [0, 1956, 1964, 1971, 1987]
#Add new category
murders['MedYrHousBuilt_bins'] = pd.cut(murders['MedYrHousBuilt'], bins=MedYrHousBuilt_bin, labels=MedYrHousBuilt_labels)
murders.head()
1939
1987
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956
In [ ]:
murders['MedYrHousBuilt_bins'].value_counts()
Out[ ]:
1956-1964    516
1939-1956    501
1964-1971    454
1971-1987    448
Name: MedYrHousBuilt_bins, dtype: int64
In [ ]:
murders['PctWOFullPlumb'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctWOFullPlumb'].min())
print(murders['PctWOFullPlumb'].max())

#Determine bins based on quantiles
PctWOFullPlumb_ = pd.qcut(murders['PctWOFullPlumb'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctWOFullPlumb_.value_counts()

#Create bin labels
PctWOFullPlumb_labels = ['0-0.16%', '0.16-0.3%', '0.3-0.5%', '0.5-5%']
#Create bins
PctWOFullPlumb_bin = [-1, 0.16, 0.3, 0.5, 5.33]
#Add new category
murders['PctWOFullPlumb_bins'] = pd.cut(murders['PctWOFullPlumb'], bins=PctWOFullPlumb_bin, labels=PctWOFullPlumb_labels)
murders.head()
0.0
5.33
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3%
In [ ]:
murders['PctWOFullPlumb_bins'].value_counts()
Out[ ]:
0.5-5%       564
0.3-0.5%     475
0-0.16%      453
0.16-0.3%    427
Name: PctWOFullPlumb_bins, dtype: int64
In [ ]:
murders['RentQrange'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['RentQrange'].min())
print(murders['RentQrange'].max())

#Determine bins based on quantiles
RentQrange_ = pd.qcut(murders['RentQrange'], q=4)
#Check the value counts of each bin to ensure they are balanced
RentQrange_.value_counts()


#Create bin labels
RentQrange_labels = ['0-140', '140-170', '170-230', '230-805']
#Create bins
RentQrange_bin = [-1, 140, 170, 230, 803]
#Add new category
murders['RentQrange_bins'] = pd.cut(murders['RentQrange'], bins=RentQrange_bin, labels=RentQrange_labels)
murders.head()
0
803
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805
In [ ]:
murders['RentQrange_bins'].value_counts()
Out[ ]:
230-805    534
0-140      489
170-230    482
140-170    414
Name: RentQrange_bins, dtype: int64
In [ ]:
murders['MedRentPctHousInc'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['MedRentPctHousInc_norm'] = (murders['MedRentPctHousInc'] - murders['MedRentPctHousInc'].min()) / (murders['MedRentPctHousInc'].max() - murders['MedRentPctHousInc'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297
In [ ]:
murders['MedOwnCostPctInc'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['MedOwnCostPctInc_norm'] = (murders['MedOwnCostPctInc'] - murders['MedOwnCostPctInc'].min()) / (murders['MedOwnCostPctInc'].max() - murders['MedOwnCostPctInc'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215
In [ ]:
murders['MedOwnCostPctIncNoMtg'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['MedOwnCostPctIncNoMtg'].min())
print(murders['MedOwnCostPctIncNoMtg'].max())

#Determine bins based on quantiles
MedOwnCostPctIncNoMtg_ = pd.qcut(murders['MedOwnCostPctIncNoMtg'], q=4)
#Check the value counts of each bin to ensure they are balanced
MedOwnCostPctIncNoMtg_.value_counts()

#Create bin labels
MedOwnCostPctIncNoMtg_labels = ['10-12%', '12-13%', '13-14%', '14-25%']
#Create bins
MedOwnCostPctIncNoMtg_bin = [10, 12, 13, 14, 25]
#Add new category
murders['MedOwnCostPctIncNoMtg_bins'] = pd.cut(murders['MedOwnCostPctIncNoMtg'], bins=MedOwnCostPctIncNoMtg_bin, labels=MedOwnCostPctIncNoMtg_labels)
murders.head()
10.1
23.4
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13%
In [ ]:
murders['MedOwnCostPctIncNoMtg_bins'].value_counts()
Out[ ]:
12-13%    559
10-12%    538
13-14%    434
14-25%    388
Name: MedOwnCostPctIncNoMtg_bins, dtype: int64
In [ ]:
murders['PctBornSameState'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctBornSameState'].min())
print(murders['PctBornSameState'].max())

#Determine bins based on quantiles
PctBornSameState_ = pd.qcut(murders['PctBornSameState'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctBornSameState_.value_counts()

#Create bin labels
PctBornSameState_labels = ['0-50%', '50-65%', '65-75%', '75-95%']
#Create bins
PctBornSameState_bin = [0, 50, 65, 75, 95]
#Add new category
murders['PctBornSameState_bins'] = pd.cut(murders['PctBornSameState'], bins=PctBornSameState_bin, labels=PctBornSameState_labels)
murders.head()
6.75
93.14
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95%
In [ ]:
murders['PctBornSameState_bins'].value_counts()
Out[ ]:
0-50%     535
50-65%    520
65-75%    433
75-95%    431
Name: PctBornSameState_bins, dtype: int64
In [ ]:
murders['PctSameHouse85'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
# apply normalization technique, using a range of [0,1]
murders['PctSameHouse85_norm'] = (murders['PctSameHouse85'] - murders['PctSameHouse85'].min()) / (murders['PctSameHouse85'].max() - murders['PctSameHouse85'].min())    
  
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565
In [ ]:
murders['PctSameState85'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctSameState85'].min())
print(murders['PctSameState85'].max())

#Determine bins based on quantiles
PctSameState85_ = pd.qcut(murders['PctSameState85'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctSameState85_.value_counts()

#Create bin labels
PctSameState85_labels = ['32-85%', '85-90%', '90-93%', '93-100%']
#Create bins
PctSameState85_bin = [32, 85, 90, 93, 100]
#Add new category
murders['PctSameState85_bins'] = pd.cut(murders['PctSameState85'], bins=PctSameState85_bin, labels=PctSameState85_labels)
murders.head()
32.83
99.9
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100%
In [ ]:
murders['PctSameState85_bins'].value_counts()
Out[ ]:
32-85%     505
85-90%     500
90-93%     473
93-100%    441
Name: PctSameState85_bins, dtype: int64
In [ ]:
murders['LandArea'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['LandArea'].min())
print(murders['LandArea'].max())

#Determine bins based on quantiles
LandArea_ = pd.qcut(murders['LandArea'], q=4)
#Check the value counts of each bin to ensure they are balanced
LandArea_.value_counts()

#Create bin labels
LandArea_labels = ['0-7', '7-14', '14-26', '26-4000']
#Create bins
LandArea_bin = [0, 7, 14, 26, 3570]
#Add new category
murders['LandArea_bins'] = pd.cut(murders['LandArea'], bins=LandArea_bin, labels=LandArea_labels)
murders.head()
0.9
3569.8
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14
In [ ]:
murders['LandArea_bins'].value_counts()
Out[ ]:
7-14       525
0-7        467
26-4000    465
14-26      462
Name: LandArea_bins, dtype: int64
In [ ]:
murders['PopDens'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PopDens'].min())
print(murders['PopDens'].max())

#Determine bins based on quantiles
PopDens_ = pd.qcut(murders['PopDens'], q=4)
#Check the value counts of each bin to ensure they are balanced
PopDens_.value_counts()

#Create bin labels
PopDens_labels = ['10-1200', '1200-2000', '2000-3300', '3300-45000']
#Create bins
PopDens_bin = [9, 1200, 2000, 3300, 45000]
#Add new category
murders['PopDens_bins'] = pd.cut(murders['PopDens'], bins=PopDens_bin, labels=PopDens_labels)
murders.head()
10.0
44229.9
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300
In [ ]:
murders['PopDens_bins'].value_counts()
Out[ ]:
10-1200       496
2000-3300     488
3300-45000    478
1200-2000     457
Name: PopDens_bins, dtype: int64
In [ ]:
murders['PctUsePubTrans'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['PctUsePubTrans'].min())
print(murders['PctUsePubTrans'].max())

#Determine bins based on quantiles
PctUsePubTrans_ = pd.qcut(murders['PctUsePubTrans'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctUsePubTrans_.value_counts()

#Create bin labels
PctUsePubTrans_labels = ['0-0.4%', '0.4-1.2%', '1.2-3.3%', '3.3-55%']
#Create bins
PctUsePubTrans_bin = [-1, 0.4, 1.2, 3.3, 55]
#Add new category
murders['PctUsePubTrans_bins'] = pd.cut(murders['PctUsePubTrans'], bins=PctUsePubTrans_bin, labels=PctUsePubTrans_labels)
murders.head()
0.0
54.33
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 3.3-55%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 3.3-55%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 3.3-55%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.4-1.2%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 3.3-55%
In [ ]:
murders['PctUsePubTrans_bins'].value_counts()
Out[ ]:
0-0.4%      516
3.3-55%     495
1.2-3.3%    487
0.4-1.2%    421
Name: PctUsePubTrans_bins, dtype: int64
In [ ]:
murders['LemasPctOfficDrugUn'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
#Check the minimum and maximum value
print(murders['LemasPctOfficDrugUn'].min())
print(murders['LemasPctOfficDrugUn'].max())

#Determine bins based on quantiles
murders['LemasPctOfficDrugUn'].describe()

#Create bin labels
PctUsePubTrans_labels = ['0%', '0.5-50%']
#Create bins
PctUsePubTrans_bin = [-1, 0.44, 60]
#Add new category
murders['PctUsePubTrans_bins'] = pd.cut(murders['PctUsePubTrans'], bins=PctUsePubTrans_bin, labels=PctUsePubTrans_labels)
murders.head()
0.0
48.44
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50%
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50%
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50%
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50%
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50%
In [ ]:
murders['PctUsePubTrans_bins'].value_counts()
Out[ ]:
0.5-50%    1386
0%          533
Name: PctUsePubTrans_bins, dtype: int64
In [ ]:
murders['murdPerPop'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
In [ ]:
murders_backup = murders

Prepare the murder dataset for CLASSIFICATION. Here, our target variable will be transformed into a binary categorical variable:

In [ ]:
#Check the minimum and maximum value
print(murders['murdPerPop'].min())
print(murders['murdPerPop'].max())

#Determine bins based on quantiles
murdPerPop_ = pd.qcut(murders['murdPerPop'], q=2)
#Check the value counts of each bin to ensure they are balanced
murdPerPop_.value_counts()

#Create bin labels
murdPerPop_labels = ['No', 'Yes']
#Create bins
murdPerPop_bin = [-1, 0, 100]
#Add new category
murders['murdPerPop_class_target'] = pd.cut(murders['murdPerPop'], bins=murdPerPop_bin, labels=murdPerPop_labels)
murders.head()
0.0
91.09
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins householdsize_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_bins agePct65up_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_bins PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_bins PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_class_target
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 2.8-5.3 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 12.3-13.6% 8.8-11.7% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 28-65% 0.096037 0.090802 25-29% 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% No
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 2.6-2.8 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 4.5-12.2% 14.4-52.7% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 28-65% 0.117016 0.123821 25-29% 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% No
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 1.5-2.5 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 4.5-12.2% 8.8-11.7% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 20-23% 0.172960 0.469929 25-29% 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Yes
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 1.5-2.5 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 15.4-54.4% 11.7-14.4% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 23-28% 0.311189 0.546580 29-34% 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Yes
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 2.5-2.6 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 4.5-12.2% 11.7-14.4% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 23-28% 0.166200 0.226415 34-77% 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% No
In [ ]:
murders['murdPerPop_class_target'].value_counts()
Out[ ]:
Yes    1043
No      876
Name: murdPerPop_class_target, dtype: int64
In [ ]:
murders_classification = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins', 
                                  'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins', 
                                  'medIncome_bins', 'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm', 
                                  'blackPerCap_bins', 'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins', 
                                  'PctPopUnderPov_bins', 'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm',
                                  'PctOccupManu_norm','MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm',
                                  'PctImmigRecent_bins','PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins',
                                  'PctHousOccup_bins','PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins',
                                  'RentQrange_bins','MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins',
                                  'PctBornSameState_bins','PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins',
                                  'PctUsePubTrans_bins', 'murdPerPop_class_target']].copy()

murders_classification.head()
murders_classification.info()
In [ ]:
murders_class_backup = murders_classification
murders_class_backup.info()
In [ ]:
murders_classification['state'] = murders_classification['state'].astype('category')
In [ ]:
murders_class_backup = murders_classification

Prepare the murder dataset for REGRESSION. Here, the target variable will be normalized numerically.

***min/max scalar function

In [ ]:
# apply normalization technique, using a range of [0,1]
murders['murdPerPop_reg_target'] = (murders['murdPerPop'] - murders['murdPerPop'].min()) / (murders['murdPerPop'].max() - murders['murdPerPop'].min())    
  
murders.head()
In [ ]:
murders_regression = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins', 
                              'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins', 
                              'medIncome_bins', 'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm', 
                              'blackPerCap_bins', 'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins', 
                              'PctPopUnderPov_bins', 'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm',
                              'PctOccupManu_norm','MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm',
                              'PctImmigRecent_bins','PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins',
                              'PctHousOccup_bins','PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins',
                              'RentQrange_bins','MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins',
                              'PctBornSameState_bins','PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins',
                              'PctUsePubTrans_bins', 'murdPerPop_reg_target']].copy()

murders_regression.head()
Out[ ]:
state pop_bins householdsize_norm racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_norm agePct65up_norm pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_norm PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_norm PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_reg_target
0 NJ 10000-13500 0.407609 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 0.158370 0.189200 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 0.372916 0.096037 0.090802 0.222845 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% 0.000000
1 PA 19000-29000 0.331522 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 0.129065 0.303659 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 0.381438 0.117016 0.123821 0.247899 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% 0.000000
2 OR 29000-51500 0.225543 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 0.136090 0.168656 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 0.237681 0.172960 0.469929 0.213508 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% 0.091119
4 MO 515000-7500000 0.230978 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 0.271176 0.226961 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 0.336050 0.311189 0.546580 0.330843 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% 0.050829
5 MA 19000-29000 0.271739 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 0.132276 0.249658 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 0.342349 0.166200 0.226415 0.373327 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% 0.000000
In [ ]:
murders_reg_backup = murders_regression
In [ ]:
murders_regression['state'] = murders_regression['state'].astype('category')

Here, I want to make sure both murders and robberies datasets contain the same columns, to avoid repeating the same normalization steps.

In [ ]:
print(murders_copy.columns.difference(robberies.columns))
print(robberies.columns.difference(murders_copy.columns))
Index(['murdPerPop'], dtype='object')
Index(['robbbPerPop'], dtype='object')

Both dataframes have the exact columns, other than their target variables. Because of this, I will only normalize the robbbPerPop target variable, and create new datasets in preparation for our regression and classification models using previously normalized columns from the murders dataset.

In [ ]:
murders.head()
Out[ ]:
communityname state population householdsize racepctblack racePctWhite racePctAsian racePctHisp agePct12t21 agePct65up pctUrban medIncome pctWFarmSelf pctWInvInc pctWPubAsst pctWRetire blackPerCap indianPerCap AsianPerCap OtherPerCap HispPerCap PctPopUnderPov PctLess9thGrade PctBSorMore PctEmplManu PctEmplProfServ PctOccupManu MalePctDivorce MalePctNevMarr PctFam2Par PctWorkMomYoungKids PctImmigRecent PctRecentImmig PctPersOwnOccup PctHousLess3BR MedNumBR PctHousOccup PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctWOFullPlumb RentQrange MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg PctBornSameState PctSameHouse85 PctSameState85 LandArea PopDens PctUsePubTrans LemasPctOfficDrugUn murdPerPop pop_bins racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctOccupManu_norm MalePctDivorce_norm PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_class_target murdPerPop_reg_target householdsize_norm agePct12t21_norm agePct65up_norm PctEmplProfServ_norm MalePctNevMarr_norm
0 BerkeleyHeightstownship NJ 11980 3.10 1.37 91.78 6.50 1.88 12.47 11.33 100.0 75122 1.55 70.20 1.03 18.39 13600 5725 27101 5115.0 22838 1.96 5.81 48.18 14.65 28.82 5.49 3.67 26.38 91.43 44.56 8.69 0.93 91.46 11.06 3 98.37 3.12 37.50 1959 0.28 316 23.8 21.1 14.0 53.72 65.29 89.14 6.5 1845.9 9.63 0.0 0.00 10000-13500 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 0.096037 0.090802 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% No 0.000000 0.407609 0.158370 0.189200 0.372916 0.222845
1 Marpletownship PA 23123 2.82 0.80 95.57 3.44 0.85 11.01 17.18 100.0 47917 1.11 64.11 2.75 22.85 18137 0 20074 5250.0 12222 3.98 5.61 29.89 12.26 29.28 6.39 4.23 27.99 86.91 51.14 5.21 0.43 89.03 23.60 3 97.15 0.00 18.33 1958 0.14 205 27.6 20.7 12.5 77.17 71.27 96.12 10.6 2186.7 3.84 0.0 0.00 19000-29000 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 0.117016 0.123821 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.331522 0.129065 0.303659 0.381438 0.247899
2 Tigardcity OR 29344 2.43 0.74 94.33 3.43 2.35 11.36 10.28 100.0 35669 1.15 55.73 2.94 14.56 16644 21606 15528 5954.0 8405 4.75 2.80 30.13 15.95 21.52 8.79 10.10 25.78 78.54 66.08 16.42 0.82 64.18 47.46 3 95.68 0.92 7.54 1976 0.12 150 24.1 21.7 11.6 44.77 36.60 82.85 10.6 2780.9 4.37 0.0 8.30 29000-51500 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 0.172960 0.469929 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Yes 0.091119 0.225543 0.136090 0.168656 0.237681 0.213508
4 Springfieldcity MO 140494 2.45 2.51 95.65 0.90 0.95 18.09 13.26 100.0 21577 1.00 41.15 7.12 14.09 7382 10264 10753 7192.0 8104 17.78 8.76 20.66 14.31 26.83 14.72 11.40 33.32 71.94 62.96 21.33 0.32 57.81 53.19 2 91.81 2.09 26.22 1966 0.31 134 26.4 17.3 11.7 64.35 42.29 85.66 70.4 1995.7 0.97 0.0 4.63 515000-7500000 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 0.311189 0.546580 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Yes 0.050829 0.230978 0.271176 0.226961 0.336050 0.330843
5 Norwoodtown MA 28700 2.60 1.60 96.57 1.47 1.10 11.17 14.42 100.0 42805 0.39 47.70 5.41 17.23 17342 21482 12639 21852.0 22594 4.01 4.49 27.01 14.02 27.17 8.50 5.97 36.05 79.53 65.16 11.38 1.05 64.62 47.35 3 95.11 1.41 34.45 1956 0.28 361 24.4 20.8 12.5 77.30 63.45 93.53 10.9 2643.5 9.62 0.0 0.00 19000-29000 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 0.166200 0.226415 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% No 0.000000 0.271739 0.132276 0.249658 0.342349 0.373327
In [ ]:
#Create our normalized robberies dataframes

#REGRESSION DATASET
robberies_regression = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins', 
                                'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins', 'medIncome_bins',
                                  'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm', 'blackPerCap_bins', 
                                  'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins', 'PctPopUnderPov_bins', 
                                  'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm','PctOccupManu_norm',
                                  'MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm','PctImmigRecent_bins',
                                  'PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins','PctHousOccup_bins',
                                  'PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins','RentQrange_bins',
                                  'MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins','PctBornSameState_bins',
                                  'PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins','PctUsePubTrans_bins']].copy()

robberies_regression.info()
In [ ]:
robberies_regression['state'] = robberies_regression['state'].astype('category')
In [ ]:
# apply normalization technique, using a range of [0,1]
robberies_regression['robbbPerPop_reg_target'] = (robberies['robbbPerPop'] - robberies['robbbPerPop'].min()) / (robberies['robbbPerPop'].max() - robberies['robbbPerPop'].min())    
  
robberies_regression.head()
In [ ]:
#CLASSIFICATION DATASET
robberies_classification = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins', 
                                    'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins', 
                                    'medIncome_bins', 'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm', 
                                    'blackPerCap_bins', 'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins', 
                                    'PctPopUnderPov_bins', 'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm',
                                    'PctOccupManu_norm','MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm',
                                    'PctImmigRecent_bins','PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins',
                                    'PctHousOccup_bins','PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins',
                                    'RentQrange_bins','MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins',
                                    'PctBornSameState_bins','PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins',
                                    'PctUsePubTrans_bins']].copy()

robberies_classification.info()
In [ ]:
robberies_classification['state']=robberies_classification['state'].astype('category')
In [ ]:
#Check the minimum and maximum value
print(robberies['robbbPerPop'].min())
print(robberies['robbbPerPop'].max())

#Determine bins based on quantiles
robbbPerPop_ = pd.qcut(robberies['robbbPerPop'], q=3)
#Check the value counts of each bin to ensure they are balanced
robbbPerPop_.value_counts()

#Create bin labels
robbbPerPop_labels = ['Unlikely', 'Likely', 'Very Likely']
#Create bins
robbbPerPop_bin = [-1, 40, 145, 2500]
#Add new category
robberies_classification['robbbPerPop_class_target'] = pd.cut(robberies['robbbPerPop'], bins=robbbPerPop_bin, labels=robbbPerPop_labels)
robberies_classification.head()
0.0
2264.13
Out[ ]:
state pop_bins householdsize_norm racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_norm agePct65up_norm pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_norm PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_norm PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins robbbPerPop_class_target
0 NJ 10000-13500 0.407609 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 0.158370 0.189200 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 0.372916 0.096037 0.090802 0.222845 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% Unlikely
1 PA 19000-29000 0.331522 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 0.129065 0.303659 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 0.381438 0.117016 0.123821 0.247899 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% Unlikely
2 OR 29000-51500 0.225543 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 0.136090 0.168656 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 0.237681 0.172960 0.469929 0.213508 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Very Likely
4 MO 515000-7500000 0.230978 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 0.271176 0.226961 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 0.336050 0.311189 0.546580 0.330843 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Likely
5 MA 19000-29000 0.271739 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 0.132276 0.249658 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 0.342349 0.166200 0.226415 0.373327 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% Unlikely
In [ ]:
robberies_classification['robbbPerPop_class_target'].value_counts()
Out[ ]:
Likely         656
Very Likely    642
Unlikely       621
Name: robbbPerPop_class_target, dtype: int64

In [ ]:
#MAKING COPIES OF ALL DATASETS FOR BACKUP

murd_reg_copy = murders_regression.copy()
rob_reg_copy = robberies_regression.copy()

murders_class_copy = murders_classification.copy()
rob_class_copy = robberies_classification.copy()

murders_reg_backup = murders_regression
robberies_reg_backup = robberies_regression

murdregcopy = murders_regression.copy()

MURDERS DATASET:

In [ ]:
#Make a backup copy of our dataset
murdclasscopy = murders_class_copy
In [ ]:
#Encode the cateogorical variables with LABEL ENCODING METHOD
#This code block is a test run, encoding the column 'state'
#If successful, the rest of the categorical variables will be encoded the same way

from sklearn.preprocessing import LabelEncoder

#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
murdclasscopy['state'] = lab.fit_transform(murdclasscopy['state'])
In [ ]:
#Encode the rest of the categorical variables with for loop function

for cols in murdclasscopy.columns:
    if murdclasscopy[cols].dtype == 'category':
        murdclasscopy[cols] = lab.fit_transform(murdclasscopy[cols])
    else:
       pass
murdclasscopy.head()
Out[ ]:
state pop_bins householdsize_norm racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_norm agePct65up_norm pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_norm PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_norm PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins murdPerPop_class_target
0 25 0 0.407609 1 2 3 1 0.158370 0.189200 2 3 3 0.764559 0 0.355054 1 0 2 0 2 0 2 3 0.262609 0.372916 0.096037 0.090802 0.222845 3 0.316916 3 2 0.937939 0.086693 1 3 2 0.435135 1 1 3 0.440594 0.376344 2 1 0.801139 1 0 1 1 0
1 32 2 0.331522 0 2 3 0 0.129065 0.303659 2 3 3 0.688453 0 0.461118 2 0 2 0 1 0 2 2 0.212797 0.381438 0.117016 0.123821 0.247899 3 0.420456 0 1 0.908541 0.222583 1 3 0 0.192507 1 0 2 0.628713 0.354839 1 3 0.890754 3 3 2 1 0
2 31 3 0.225543 0 2 3 1 0.136090 0.168656 2 1 3 0.583729 0 0.263971 2 2 1 2 3 0 0 3 0.289704 0.237681 0.172960 0.469929 0.213508 2 0.655547 1 2 0.607912 0.481144 1 2 1 0.055942 3 0 1 0.455446 0.408602 0 0 0.371197 0 3 2 1 1
4 20 4 0.230978 1 2 1 0 0.271176 0.226961 2 2 2 0.401525 2 0.252794 3 1 3 2 3 2 3 2 0.255523 0.336050 0.311189 0.546580 0.330843 1 0.606452 2 1 0.530849 0.543238 0 1 2 0.292368 2 2 0 0.569307 0.172043 0 1 0.456466 1 2 1 1 1
5 16 2 0.271739 1 3 2 1 0.132276 0.249658 2 3 0 0.483379 2 0.327467 2 2 1 1 2 0 0 2 0.249479 0.342349 0.166200 0.226415 0.373327 2 0.641070 3 2 0.613235 0.479952 1 2 1 0.396532 0 1 3 0.470297 0.360215 1 3 0.773565 3 3 2 1 0
In [ ]:
#check info of our dataframe to ensure it is of the correct type, and that all of the categorical attributes were transformed

murdclasscopy.info()

Now we are ready to run our models. First, I will split the data into train and test sets. I will be running a LOGISTIC REGRESSION model on both the murders and robberies datasets, both preprocessed specifically for this task. Then I will running my 3 classification models: KNN, Decision Tree, and Random forest. Finally, I will be running a confusion matrix for each model. I will only compare the 3 classification models to each other, using 3 evaluation metrics: Accuracy, Precision, and Recall. The logistic regression model is for exploration purposes only, for the sake of curiosity.

Next, I will use K-Folds cross validation for my train/test split, and re-run each model for both the MURDERS and ROBBERIES dataset. I will examine the confusion matrices and run evaluation metrics again.

MURDERS DATASET:

In [ ]:
#Split the data into train test split
from sklearn.model_selection import train_test_split

#Split the dataset into training set and test set
#Our class column is murdPerPop_class_target, everything else will be used as features
class_murd_colname = 'murdPerPop_class_target'
feature_murd_names = murdclasscopy.columns[murdclasscopy.columns != class_murd_colname]

#70% training and 30% test
x1_train, x1_test, y1_train, y1_test = train_test_split(murdclasscopy.loc[:, feature_murd_names], 
                                                    murdclasscopy[class_murd_colname], test_size=0.3, random_state=42)
In [ ]:
#LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression(class_weight= 'balanced')
logmodel.fit(x1_train, y1_train)
predictions_log = logmodel.predict(x1_test)

predictions_log
In [ ]:
#KNN ALGORITHM

from sklearn.neighbors import KNeighborsClassifier 
 
classifier = KNeighborsClassifier(n_neighbors=5) 
classifier.fit(x1_train, y1_train) 

murd_pred = classifier.predict(x1_test)  
murd_pred
In [ ]:
#DECISION TREE

from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#create and train the model

clf = DecisionTreeClassifier(max_depth=5, random_state=1234)
model = clf.fit(x1_train, y1_train)

#Predict the response for test dataset
murd_treepred = clf.predict(x1_test)
murd_treepred
In [ ]:
from sklearn.tree import plot_tree

#plot tree (visual representation)

features = feature_murd_names
classes = class_murd_colname

plt.figure(figsize=(40, 40))
plot_tree(clf,
          fontsize=10,
          feature_names=features,
          class_names=classes,
          rounded=True,
          filled=True,
          proportion=True); 
In [ ]:
#plot tree (textual representation)

text_representation = tree.export_text(clf)
print(text_representation)
In [ ]:
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier

randf=RandomForestClassifier()
randf.fit(x1_train, y1_train)

murd_rf_pred = randf.predict(x1_test)
murd_rf_pred
In [ ]:
#CONFUSION MATRICES

from sklearn.metrics import confusion_matrix
log_cm = confusion_matrix(y1_test, predictions_log)
knn_cm = confusion_matrix(y1_test, murd_pred)
dt_cm = confusion_matrix(y1_test, murd_treepred)
rf_cm = confusion_matrix(y1_test, murd_rf_pred)

print('LOGISTIC REGRESSION CONFUSION MATRIX')
print(log_cm)
print('KNN CONFUSION MATRIX')
print(knn_cm)
print('DECISION TREE CONFUSION MATRIX')
print(dt_cm)
print('RANDOM FOREST CONFUSION MATRIX')
print(rf_cm)
LOGISTIC REGRESSION CONFUSION MATRIX
[[206  53]
 [ 73 244]]
KNN CONFUSION MATRIX
[[183  76]
 [ 72 245]]
DECISION TREE CONFUSION MATRIX
[[197  62]
 [ 74 243]]
RANDOM FOREST CONFUSION MATRIX
[[195  64]
 [ 74 243]]
In [ ]:
#EVALUATION METRICS

log_accuracy = (log_cm[0][0] + log_cm[1][1])/(len(y1_test))
log_precision = log_cm[0][0]/(log_cm[1][0] + log_cm[0][0])
log_recall = (log_cm[0][0]/(log_cm[0][0] + log_cm[0][1]))

knn_accuracy = (knn_cm[0][0] + knn_cm[1][1])/(len(y1_test))
knn_precision = knn_cm[0][0]/(knn_cm[1][0] + knn_cm[0][0])
knn_recall = (knn_cm[0][0]/(knn_cm[0][0] + knn_cm[0][1]))

dt_accuracy = (dt_cm[0][0] + dt_cm[1][1])/(len(y1_test))
dt_precision = dt_cm[0][0]/(dt_cm[1][0] + dt_cm[0][0])
dt_recall = (dt_cm[0][0]/(dt_cm[0][0] + dt_cm[0][1]))

rf_accuracy = (rf_cm[0][0] + rf_cm[1][1])/(len(y1_test))
rf_precision = rf_cm[0][0]/(rf_cm[1][0] + rf_cm[0][0])
rf_recall = (rf_cm[0][0]/(rf_cm[0][0] + rf_cm[0][1]))

print('ACCURACY')
print('Logistic Regression: ACCURACY=', log_accuracy)
print('KNN Classifier: ACCURACY=', knn_accuracy)
print('Decision Tree: ACCURACY=', dt_accuracy)
print('Random Forest: ACCURACY=', rf_accuracy)

print('PRECISION')
print('Logistic Regression: PRECISION=', log_precision)
print('KNN Classifier: PRECISION=', knn_precision)
print('Decision Tree: PRECISION=', dt_precision)
print('Random Forest: PRECISION=', rf_accuracy)

print('RECALL')
print('Logistic Regression: RECALL=', log_recall)
print('KNN Classifier: RECALL=', knn_recall)
print('Decision Tree: RECALL=', dt_recall)
print('Random Forest: RECALL=', rf_accuracy)
ACCURACY
Logistic Regression: ACCURACY= 0.78125
KNN Classifier: ACCURACY= 0.7430555555555556
Decision Tree: ACCURACY= 0.7638888888888888
Random Forest: ACCURACY= 0.7604166666666666
PRECISION
Logistic Regression: PRECISION= 0.7383512544802867
KNN Classifier: PRECISION= 0.7176470588235294
Decision Tree: PRECISION= 0.7269372693726938
Random Forest: PRECISION= 0.7604166666666666
RECALL
Logistic Regression: RECALL= 0.7953667953667953
KNN Classifier: RECALL= 0.7065637065637066
Decision Tree: RECALL= 0.7606177606177607
Random Forest: RECALL= 0.7604166666666666

MURDERS: K-FOLD CROSS VALIDATION

In [ ]:
# define dataset
X, y = murdclasscopy.loc[:, feature_murd_names], murdclasscopy[class_murd_colname]
# summarize the dataset
print(X.shape, y.shape)
(1919, 50) (1919,)
In [ ]:
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=42, shuffle=True)
In [ ]:
# evaluate a logistic regression model using k-fold cross-validation

# create model
log_model = LogisticRegression()
# evaluate model
log_scores_acc = cross_val_score(log_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
log_scores_pre = cross_val_score(log_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
log_scores_re = cross_val_score(log_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(log_scores_acc), std(log_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(log_scores_pre), std(log_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(log_scores_re), std(log_scores_re)))
Accuracy: 0.776 (0.029)
Precision: 0.785 (0.033)
Recall: 0.806 (0.040)
In [ ]:
# evaluate a knn model using k-fold cross-validation

# create model
knn_model = KNeighborsClassifier()
# evaluate model
knn_scores_acc = cross_val_score(knn_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
knn_scores_pre = cross_val_score(knn_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
knn_scores_re = cross_val_score(knn_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(knn_scores_acc), std(knn_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(knn_scores_pre), std(knn_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(knn_scores_re), std(knn_scores_re)))
Accuracy: 0.752 (0.028)
Precision: 0.759 (0.030)
Recall: 0.795 (0.036)
In [ ]:
# evaluate a decision tree model using k-fold cross-validation

# create model
dt_model = DecisionTreeClassifier(max_depth=5)
# evaluate model
dt_scores_acc = cross_val_score(dt_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
dt_scores_pre = cross_val_score(dt_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
dt_scores_re = cross_val_score(dt_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(dt_scores_acc), std(dt_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(dt_scores_pre), std(dt_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(dt_scores_re), std(dt_scores_re)))
Accuracy: 0.727 (0.029)
Precision: 0.767 (0.042)
Recall: 0.713 (0.040)
In [ ]:
# evaluate a random forest model using k-fold cross-validation

# create model
rf_model = RandomForestClassifier()
# evaluate model
rf_scores_acc = cross_val_score(rf_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
rf_scores_pre = cross_val_score(rf_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
rf_scores_re = cross_val_score(rf_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rf_scores_acc), std(rf_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(rf_scores_pre), std(rf_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(rf_scores_re), std(rf_scores_re)))
Accuracy: 0.768 (0.028)
Precision: 0.794 (0.030)
Recall: 0.786 (0.032)

ROBBERIES DATASET:

In [ ]:
#Create a copy of the dataset

robclasscopy = rob_class_copy.copy()
robclasscopy.head()
Out[ ]:
state pop_bins householdsize_norm racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_norm agePct65up_norm pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_norm PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_norm PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins robbbPerPop_class_target
0 NJ 10000-13500 0.407609 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 0.158370 0.189200 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 0.372916 0.096037 0.090802 0.222845 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% Unlikely
1 PA 19000-29000 0.331522 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 0.129065 0.303659 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 0.381438 0.117016 0.123821 0.247899 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% Unlikely
2 OR 29000-51500 0.225543 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 0.136090 0.168656 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 0.237681 0.172960 0.469929 0.213508 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Very Likely
4 MO 515000-7500000 0.230978 0.9-2.8% 90-96% 0.6-1.2% 0.1-0.9% 0.271176 0.226961 100% $8,000-$24,000 0.7-1.0% 0.401525 5-8% 0.252794 6500-10000 10000-15000 8500-12500 5500-8000 7500-1000 17-60% 8-12% 19-30 0.255523 0.336050 0.311189 0.546580 0.330843 67-70% 0.606452 18-65% 0.2-0.5% 0.530849 0.543238 0-2.5 91-94% 1.75-4% 0.292368 1964-1971 0.3-0.5% 0-140 0.569307 0.172043 10-12% 50-65% 0.456466 85-90% 26-4000 1200-2000 0.5-50% Likely
5 MA 19000-29000 0.271739 0.9-2.8% 96-100% 1.2-2.6% 0.9-2.2% 0.132276 0.249658 100% 42,000-$125,000 0-0.5% 0.483379 5-8% 0.327467 15000-250000 15000-500000 12500-17500 11500-137000 13500-55000 0-5% 0-5% 19-30 0.249479 0.342349 0.166200 0.226415 0.373327 75-82% 0.641070 7-12% 0.5-1.3% 0.613235 0.479952 2.5-4 94-96% 0.75-1.75% 0.396532 1939-1956 0.16-0.3% 230-805 0.470297 0.360215 12-13% 75-95% 0.773565 93-100% 7-14 2000-3300 0.5-50% Unlikely
In [ ]:
#Encode the cateogorical variables with LABEL ENCODING METHOD
#This code block is a test run, encoding the column 'state'
#If successful, the rest of the categorical variables will be encoded the same way

#perform label encoding on 'team' column
robclasscopy['state'] = lab.fit_transform(robclasscopy['state'])

robclasscopy.head(3)
Out[ ]:
state pop_bins householdsize_norm racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_norm agePct65up_norm pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_norm PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_norm PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins robbbPerPop_class_target
0 25 10000-13500 0.407609 0.9-2.8% 90-96% 2.7-57.5% 0.9-2.2% 0.158370 0.189200 100% 42,000-$125,000 1.0-7.0% 0.764559 0-3% 0.355054 10000-15000 0-6500 17500-106500 0-5500 13500-55000 0-5% 5-8% 30-80 0.262609 0.372916 0.096037 0.090802 0.222845 82-95% 0.316916 7-12% 0.5-1.3% 0.937939 0.086693 2.5-4 96-99% 1.75-4% 0.435135 1956-1964 0.16-0.3% 230-805 0.440594 0.376344 13-14% 50-65% 0.801139 85-90% 0-7 1200-2000 0.5-50% Unlikely
1 32 19000-29000 0.331522 0-0.8% 90-96% 2.7-57.5% 0.1-0.9% 0.129065 0.303659 100% 42,000-$125,000 1.0-7.0% 0.688453 0-3% 0.461118 15000-250000 0-6500 17500-106500 0-5500 1000-13500 0-5% 5-8% 19-30 0.212797 0.381438 0.117016 0.123821 0.247899 82-95% 0.420456 0-7% 0.2-0.5% 0.908541 0.222583 2.5-4 96-99% 0-0.75% 0.192507 1956-1964 0-0.16% 170-230 0.628713 0.354839 12-13% 75-95% 0.890754 93-100% 7-14 2000-3300 0.5-50% Unlikely
2 31 29000-51500 0.225543 0-0.8% 90-96% 2.7-57.5% 0.9-2.2% 0.136090 0.168656 100% $32,000-$42,000 1.0-7.0% 0.583729 0-3% 0.263971 15000-250000 15000-500000 12500-17500 5500-8000 7500-1000 0-5% 0-5% 30-80 0.289704 0.237681 0.172960 0.469929 0.213508 75-82% 0.655547 12-18% 0.5-1.3% 0.607912 0.481144 2.5-4 94-96% 0.75-1.75% 0.055942 1971-1987 0-0.16% 140-170 0.455446 0.408602 10-12% 0-50% 0.371197 32-85% 7-14 2000-3300 0.5-50% Very Likely
In [ ]:
#Encode the rest of the categorical variables with for loop function

for r_cols in robclasscopy.columns:
    if robclasscopy[r_cols].dtype == 'category':
        robclasscopy[r_cols] = lab.fit_transform(robclasscopy[r_cols])
    else:
       pass
robclasscopy.head()
Out[ ]:
state pop_bins householdsize_norm racepctblack_bins racePctWhite_bins racePctAsian_bins racePctHisp_bins agePct12t21_norm agePct65up_norm pctUrban_bins medIncome_bins pctWFarmSelf_bins pctWInvInc_norm pctWPubAsst_bins pctWRetire_norm blackPerCap_bins indianPerCap_bins AsianPerCap_bins OtherPerCap_bins HispPerCap_bins PctPopUnderPov_bins PctLess9thGrade_bins PctBSorMore_bins PctEmplManu_norm PctEmplProfServ_norm PctOccupManu_norm MalePctDivorce_norm MalePctNevMarr_norm PctFam2Par_bins PctWorkMomYoungKids_norm PctImmigRecent_bins PctRecentImmig_bins PctPersOwnOccup_norm PctHousLess3BR_norm MedNumBR_bins PctHousOccup_bins PctVacantBoarded_bins PctVacMore6Mos_norm MedYrHousBuilt_bins PctWOFullPlumb_bins RentQrange_bins MedRentPctHousInc_norm MedOwnCostPctInc_norm MedOwnCostPctIncNoMtg_bins PctBornSameState_bins PctSameHouse85_norm PctSameState85_bins LandArea_bins PopDens_bins PctUsePubTrans_bins robbbPerPop_class_target
0 25 0 0.407609 1 2 3 1 0.158370 0.189200 2 3 3 0.764559 0 0.355054 1 0 2 0 2 0 2 3 0.262609 0.372916 0.096037 0.090802 0.222845 3 0.316916 3 2 0.937939 0.086693 1 3 2 0.435135 1 1 3 0.440594 0.376344 2 1 0.801139 1 0 1 1 1
1 32 2 0.331522 0 2 3 0 0.129065 0.303659 2 3 3 0.688453 0 0.461118 2 0 2 0 1 0 2 2 0.212797 0.381438 0.117016 0.123821 0.247899 3 0.420456 0 1 0.908541 0.222583 1 3 0 0.192507 1 0 2 0.628713 0.354839 1 3 0.890754 3 3 2 1 1
2 31 3 0.225543 0 2 3 1 0.136090 0.168656 2 1 3 0.583729 0 0.263971 2 2 1 2 3 0 0 3 0.289704 0.237681 0.172960 0.469929 0.213508 2 0.655547 1 2 0.607912 0.481144 1 2 1 0.055942 3 0 1 0.455446 0.408602 0 0 0.371197 0 3 2 1 2
4 20 4 0.230978 1 2 1 0 0.271176 0.226961 2 2 2 0.401525 2 0.252794 3 1 3 2 3 2 3 2 0.255523 0.336050 0.311189 0.546580 0.330843 1 0.606452 2 1 0.530849 0.543238 0 1 2 0.292368 2 2 0 0.569307 0.172043 0 1 0.456466 1 2 1 1 0
5 16 2 0.271739 1 3 2 1 0.132276 0.249658 2 3 0 0.483379 2 0.327467 2 2 1 1 2 0 0 2 0.249479 0.342349 0.166200 0.226415 0.373327 2 0.641070 3 2 0.613235 0.479952 1 2 1 0.396532 0 1 3 0.470297 0.360215 1 3 0.773565 3 3 2 1 1
In [ ]:
#check info of our dataframe to ensure it is of the correct type, and that all of the categorical attributes were transformed

robclasscopy.info()
In [ ]:
#Split the data into train test split
from sklearn.model_selection import train_test_split

#Split the dataset into training set and test set
#Our class column is murdPerPop_class_target, everything else will be used as features
class_rob_colname = 'robbbPerPop_class_target'
feature_rob_names = robclasscopy.columns[robclasscopy.columns != class_rob_colname]

#70% training and 30% test
x2_train, x2_test, y2_train, y2_test = train_test_split(robclasscopy.loc[:, feature_rob_names], 
                                                    robclasscopy[class_rob_colname], test_size=0.3, random_state=42)
In [ ]:
#LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression

rob_logmodel = LogisticRegression(class_weight= 'balanced')
logmodel.fit(x2_train, y2_train)
rob_pred_log = logmodel.predict(x2_test)

rob_pred_log
In [ ]:
#KNN ALGORITHM

from sklearn.neighbors import KNeighborsClassifier 
 
rob_classifier = KNeighborsClassifier(n_neighbors=5) 
classifier.fit(x2_train, y2_train) 

rob_pred = classifier.predict(x2_test)  
rob_pred
In [ ]:
#DECISION TREE

from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#create and train the model

r_clf = DecisionTreeClassifier(max_depth=5, random_state=1234)
r_model = r_clf.fit(x2_train, y2_train)

#Predict the response for test dataset
rob_treepred = r_clf.predict(x2_test)
rob_treepred
In [ ]:
from sklearn.tree import plot_tree

#plot tree (visual representation)

r_features = feature_rob_names
r_classes = class_rob_colname

plt.figure(figsize=(40, 40))
plot_tree(r_clf,
          fontsize=10,
          feature_names=r_features,
          class_names=r_classes,
          rounded=True,
          filled=True,
          proportion=True); 
In [ ]:
#plot tree (textual representation)

text_representation = tree.export_text(r_clf)
print(text_representation)
In [ ]:
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier

r_randf=RandomForestClassifier()
r_randf.fit(x2_train, y2_train)

rob_rf_pred = r_randf.predict(x1_test)
rob_rf_pred
In [ ]:
#CONFUSION MATRICES

rlog_cm = confusion_matrix(y2_test, rob_pred_log)
rknn_cm = confusion_matrix(y2_test, rob_pred)
rdt_cm = confusion_matrix(y2_test, rob_treepred)
rrf_cm = confusion_matrix(y2_test, rob_rf_pred)

print('LOGISTIC REGRESSION CONFUSION MATRIX')
print(rlog_cm)
print('KNN CONFUSION MATRIX')
print(rknn_cm)
print('DECISION TREE CONFUSION MATRIX')
print(rdt_cm)
print('RANDOM FOREST CONFUSION MATRIX')
print(rrf_cm)
LOGISTIC REGRESSION CONFUSION MATRIX
[[112  52  34]
 [ 43 141   3]
 [ 31   4 156]]
KNN CONFUSION MATRIX
[[100  56  42]
 [ 55 125   7]
 [ 37   6 148]]
DECISION TREE CONFUSION MATRIX
[[120  43  35]
 [ 64 119   4]
 [ 32   5 154]]
RANDOM FOREST CONFUSION MATRIX
[[127  41  30]
 [ 39 147   1]
 [ 27   2 162]]
In [ ]:
#EVALUATION METRICS
from sklearn import metrics

rlog_accuracy = (metrics.accuracy_score(y2_test, rob_pred_log))
rlog_precision = (metrics.precision_score(y2_test, rob_pred_log, average='macro'))
rlog_recall = (metrics.recall_score(y2_test, rob_pred_log, average='macro'))

rknn_accuracy = (metrics.accuracy_score(y2_test, rob_pred))
rknn_precision = (metrics.precision_score(y2_test, rob_pred, average='macro'))
rknn_recall = (metrics.recall_score(y2_test, rob_pred, average='macro'))

rdt_accuracy = (metrics.accuracy_score(y2_test, rob_treepred))
rdt_precision = (metrics.precision_score(y2_test, rob_treepred, average='macro'))
rdt_recall = (metrics.recall_score(y2_test, rob_treepred, average='macro'))


rrf_accuracy = (metrics.accuracy_score(y2_test, rob_rf_pred))
rrf_precision = (metrics.precision_score(y2_test, rob_rf_pred, average='macro'))
rrf_recall = (metrics.recall_score(y2_test, rob_rf_pred, average='macro'))

print('ACCURACY')
print('Logistic Regression: ACCURACY=', rlog_accuracy)
print('KNN Classifier: ACCURACY=', rknn_accuracy)
print('Decision Tree: ACCURACY=', rdt_accuracy)
print('Random Forest: ACCURACY=', rrf_accuracy)

print('PRECISION')
print('Logistic Regression: PRECISION=', rlog_precision)
print('KNN Classifier: PRECISION=', rknn_precision)
print('Decision Tree: PRECISION=', rdt_precision)
print('Random Forest: PRECISION=', rrf_accuracy)

print('RECALL')
print('Logistic Regression: RECALL=', rlog_recall)
print('KNN Classifier: RECALL=', rknn_recall)
print('Decision Tree: RECALL=', rdt_recall)
print('Random Forest: RECALL=', rrf_accuracy)
ACCURACY
Logistic Regression: ACCURACY= 0.7100694444444444
KNN Classifier: ACCURACY= 0.6475694444444444
Decision Tree: ACCURACY= 0.6822916666666666
Random Forest: ACCURACY= 0.7569444444444444
PRECISION
Logistic Regression: PRECISION= 0.7087255778946533
KNN Classifier: PRECISION= 0.6468505222424303
Decision Tree: PRECISION= 0.688685955664951
Random Forest: PRECISION= 0.7569444444444444
RECALL
Logistic Regression: RECALL= 0.7121403958484341
KNN Classifier: RECALL= 0.6494562709530373
Decision Tree: RECALL= 0.6829023216457771
Random Forest: RECALL= 0.7569444444444444

ROBBERIES: K-FOLD CROSS VALIDATION

In [ ]:
# define dataset
X1, y1 = robclasscopy.loc[:, feature_rob_names], robclasscopy[class_rob_colname]
# summarize the dataset
print(X1.shape, y1.shape)
(1919, 50) (1919,)
In [ ]:
# evaluate a logistic regression model using k-fold cross-validation

# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=42, shuffle=True)
# create model
log_model = LogisticRegression()
# evaluate model
rlog_scores_acc = cross_val_score(log_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rlog_scores_pre = cross_val_score(log_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rlog_scores_re = cross_val_score(log_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rlog_scores_acc), std(rlog_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_pred_log, average = 'macro')))
print('Recall:',(metrics.recall_score(y2_test, rob_pred_log, average = 'macro')))
Accuracy: 0.705 (0.042)
Precision: 0.7087255778946533
Recall: 0.7121403958484341
In [ ]:
# evaluate a knn model using k-fold cross-validation

# create model
knn_model = KNeighborsClassifier()
# evaluate model
rknn_scores_acc = cross_val_score(knn_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rknn_scores_pre = cross_val_score(knn_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rknn_scores_re = cross_val_score(knn_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rknn_scores_acc), std(rknn_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_pred, average = 'macro')))
print('Recall:',(metrics.recall_score(y2_test, rob_pred, average = 'macro')))
Accuracy: 0.646 (0.031)
Precision: 0.6468505222424303
Recall: 0.6494562709530373
In [ ]:
# evaluate a decision tree model using k-fold cross-validation

# create model
dt_model = DecisionTreeClassifier(max_depth=5)
# evaluate model
rdt_scores_acc = cross_val_score(dt_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rdt_scores_pre = cross_val_score(dt_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rdt_scores_re = cross_val_score(dt_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rdt_scores_acc), std(rdt_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_treepred, average = 'macro')))
print('Recall:', (metrics.recall_score(y2_test, rob_treepred, average = 'macro')))
Accuracy: 0.663 (0.027)
Precision: 0.688685955664951
Recall: 0.6829023216457771
In [ ]:
# evaluate a random forest model using k-fold cross-validation

# create model
rf_model = RandomForestClassifier()
# evaluate model
rrf_scores_acc = cross_val_score(rf_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rrf_scores_pre = cross_val_score(rf_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rrf_scores_re = cross_val_score(rf_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rrf_scores_acc),std(rdt_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_rf_pred, average = 'macro')))
print('Recall:', (metrics.recall_score(y2_test, rob_rf_pred, average = 'macro')))
Accuracy: 0.714 (0.026)
Precision: 0.7570311789837287
Recall: 0.7585593124552164